From fac276dc123ac6303b117bfecb232f892b469c2e Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 6 Sep 2024 17:47:30 +0200 Subject: [PATCH 01/43] [SPARSE] Add support for cuSPARSE backend --- CMakeLists.txt | 8 +- cmake/FindCompiler.cmake | 4 +- docs/building_the_project_with_dpcpp.rst | 8 +- docs/domains/sparse_linear_algebra.rst | 136 +++++ .../compile_time_dispatching/CMakeLists.txt | 21 +- .../sparse_blas_spmv_usm_mklcpu_cusparse.cpp | 291 ++++++++++ .../run_time_dispatching/CMakeLists.txt | 3 + include/oneapi/mkl/detail/backends.hpp | 29 +- include/oneapi/mkl/detail/backends_table.hpp | 6 + include/oneapi/mkl/sparse_blas.hpp | 3 + .../cusparse/onemkl_sparse_blas_cusparse.hpp | 35 ++ .../detail/cusparse/sparse_blas_ct.hpp | 40 ++ src/config.hpp.in | 1 + src/sparse_blas/backends/CMakeLists.txt | 4 + .../backends/cusparse/CMakeLists.txt | 85 +++ .../backends/cusparse/cusparse_error.hpp | 100 ++++ .../cusparse/cusparse_global_handle.hpp | 63 +++ .../backends/cusparse/cusparse_handles.cpp | 520 ++++++++++++++++++ .../backends/cusparse/cusparse_handles.hpp | 78 +++ .../backends/cusparse/cusparse_helper.hpp | 165 ++++++ .../cusparse/cusparse_scope_handle.cpp | 147 +++++ .../cusparse/cusparse_scope_handle.hpp | 93 ++++ .../backends/cusparse/cusparse_task.hpp | 382 +++++++++++++ .../backends/cusparse/cusparse_wrappers.cpp | 32 ++ .../cusparse/operations/cusparse_spmm.cpp | 296 ++++++++++ .../cusparse/operations/cusparse_spmv.cpp | 323 +++++++++++ .../cusparse/operations/cusparse_spsv.cpp | 263 +++++++++ .../backends/mkl_common/mkl_dispatch.hpp | 37 ++ .../backends/mkl_common/mkl_handles.cxx | 161 ++---- .../backends/mkl_common/mkl_handles.hpp | 2 + .../backends/mkl_common/mkl_spmm.cxx | 30 +- .../backends/mkl_common/mkl_spmv.cxx | 27 +- .../backends/mkl_common/mkl_spsv.cxx | 17 +- .../backends/mklcpu/mklcpu_handles.cpp | 2 +- .../backends/mklcpu/mklcpu_operations.cpp | 4 +- .../backends/mklgpu/mklgpu_handles.cpp | 2 +- .../backends/mklgpu/mklgpu_operations.cpp | 
4 +- src/sparse_blas/common_op_verification.hpp | 142 +++++ src/sparse_blas/generic_container.hpp | 67 ++- src/sparse_blas/macros.hpp | 81 +++ src/sparse_blas/sycl_helper.hpp | 80 +++ tests/unit_tests/CMakeLists.txt | 5 + tests/unit_tests/include/test_helper.hpp | 10 + tests/unit_tests/main_test.cpp | 3 +- .../sparse_blas/include/test_common.hpp | 42 +- .../sparse_blas/source/sparse_spmm_buffer.cpp | 6 +- .../sparse_blas/source/sparse_spmm_usm.cpp | 6 +- .../sparse_blas/source/sparse_spmv_buffer.cpp | 6 +- .../sparse_blas/source/sparse_spmv_usm.cpp | 6 +- .../sparse_blas/source/sparse_spsv_buffer.cpp | 11 +- .../sparse_blas/source/sparse_spsv_usm.cpp | 11 +- 51 files changed, 3635 insertions(+), 263 deletions(-) create mode 100644 examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp create mode 100644 include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp create mode 100644 src/sparse_blas/backends/cusparse/CMakeLists.txt create mode 100644 src/sparse_blas/backends/cusparse/cusparse_error.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.cpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_handles.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_helper.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_task.hpp create mode 100644 src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp create mode 100644 src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp create mode 100644 
src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp create mode 100644 src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp create mode 100644 src/sparse_blas/common_op_verification.hpp create mode 100644 src/sparse_blas/sycl_helper.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 1bd39f188..39ec0f053 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,6 +60,9 @@ option(ENABLE_CUFFT_BACKEND "Enable the cuFFT backend for the DFT interface" OFF option(ENABLE_ROCFFT_BACKEND "Enable the rocFFT backend for the DFT interface" OFF) option(ENABLE_PORTFFT_BACKEND "Enable the portFFT DFT backend for the DFT interface. Cannot be used with other DFT backends." OFF) +# sparse +option(ENABLE_CUSPARSE_BACKEND "Enable the cuSPARSE backend for the SPARSE_BLAS interface" OFF) + set(ONEMKL_SYCL_IMPLEMENTATION "dpc++" CACHE STRING "Name of the SYCL compiler") set(HIP_TARGETS "" CACHE STRING "Target HIP architectures") @@ -102,7 +105,8 @@ if(ENABLE_MKLGPU_BACKEND list(APPEND DOMAINS_LIST "dft") endif() if(ENABLE_MKLCPU_BACKEND - OR ENABLE_MKLGPU_BACKEND) + OR ENABLE_MKLGPU_BACKEND + OR ENABLE_CUSPARSE_BACKEND) list(APPEND DOMAINS_LIST "sparse_blas") endif() @@ -129,7 +133,7 @@ if(CMAKE_CXX_COMPILER OR NOT ONEMKL_SYCL_IMPLEMENTATION STREQUAL "dpc++") string(REPLACE "\\" "/" CMAKE_CXX_COMPILER ${CMAKE_CXX_COMPILER}) endif() else() - if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND + if(ENABLE_CUBLAS_BACKEND OR ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUFFT_BACKEND OR ENABLE_CUSPARSE_BACKEND OR ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND OR ENABLE_ROCFFT_BACKEND) set(CMAKE_CXX_COMPILER "clang++") elseif(ENABLE_MKLGPU_BACKEND) diff --git a/cmake/FindCompiler.cmake b/cmake/FindCompiler.cmake index 556211999..8aefc2623 100644 --- a/cmake/FindCompiler.cmake +++ b/cmake/FindCompiler.cmake @@ -37,7 +37,7 @@ if(is_dpcpp) # Check if the Nvidia target is supported. 
PortFFT uses this for choosing default configuration. check_cxx_compiler_flag("-fsycl -fsycl-targets=nvptx64-nvidia-cuda" dpcpp_supports_nvptx64) - if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND) + if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND) list(APPEND UNIX_INTERFACE_COMPILE_OPTIONS -fsycl-targets=nvptx64-nvidia-cuda -fsycl-unnamed-lambda) list(APPEND UNIX_INTERFACE_LINK_OPTIONS @@ -51,7 +51,7 @@ if(is_dpcpp) -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${HIP_TARGETS}) endif() - if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_ROCBLAS_BACKEND + if(ENABLE_CURAND_BACKEND OR ENABLE_CUSOLVER_BACKEND OR ENABLE_CUSPARSE_BACKEND OR ENABLE_ROCBLAS_BACKEND OR ENABLE_ROCRAND_BACKEND OR ENABLE_ROCSOLVER_BACKEND) set_target_properties(ONEMKL::SYCL::SYCL PROPERTIES INTERFACE_COMPILE_OPTIONS "${UNIX_INTERFACE_COMPILE_OPTIONS}" diff --git a/docs/building_the_project_with_dpcpp.rst b/docs/building_the_project_with_dpcpp.rst index e33a78046..808644792 100644 --- a/docs/building_the_project_with_dpcpp.rst +++ b/docs/building_the_project_with_dpcpp.rst @@ -104,6 +104,9 @@ The most important supported build options are: * - ENABLE_CURAND_BACKEND - True, False - False + * - ENABLE_CUSPARSE_BACKEND + - True, False + - False * - ENABLE_NETLIB_BACKEND - True, False - False @@ -183,8 +186,8 @@ Building for CUDA ^^^^^^^^^^^^^^^^^ The CUDA backends can be enabled with ``ENABLE_CUBLAS_BACKEND``, -``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``, and -``ENABLE_CUSOLVER_BACKEND``. +``ENABLE_CUFFT_BACKEND``, ``ENABLE_CURAND_BACKEND``, +``ENABLE_CUSOLVER_BACKEND``, and ``ENABLE_CUSPARSE_BACKEND``. No additional parameters are required for using CUDA libraries. In most cases, the CUDA libraries should be found automatically by CMake. 
@@ -356,6 +359,7 @@ disabled using the Ninja build system: -DENABLE_CUBLAS_BACKEND=True \ -DENABLE_CUSOLVER_BACKEND=True \ -DENABLE_CURAND_BACKEND=True \ + -DENABLE_CUSPARSE_BACKEND=True \ -DBUILD_FUNCTIONAL_TESTS=False ``$ONEMKL_DIR`` points at the oneMKL source directly. The x86 CPU (``MKLCPU``) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index eab5afd56..acff0380f 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -38,3 +38,139 @@ Currently known limitations: ``oneapi::mkl::unimplemented`` exception. - Scalar parameters ``alpha`` and ``beta`` should be host pointers to prevent synchronizations and copies to the host. + + +cuSPARSE backend +---------------- + +Currently known limitations: + +- Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will + throw an ``oneapi::mkl::unimplemented`` exception. +- The COO format requires the indices to be sorted by row. See the `cuSPARSE + documentation + <https://docs.nvidia.com/cuda/cusparse/index.html#coordinate-coo>`_. + + +Operation algorithms mapping +---------------------------- + +The following tables describe how a oneMKL SYCL Interface algorithm maps to the +backend's algorithms. Refer to the backend's documentation for a more detailed +explanation of the algorithms. + +Backends with no equivalent algorithms will fall back to the backend's default +behavior. + + +spmm +^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 10 30 45 + + * - Value + - Description + - Backend equivalent + * - ``default_optimize_alg`` + - Default algorithm. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT`` + * - ``no_optimize_alg`` + - Default algorithm but may skip some optimizations. Useful only if an + operation with the same configuration is run once. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT`` + * - ``coo_alg1`` + - Should provide best performance for COO format, small ``nnz`` and + column-major layout.
+ - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG1`` + * - ``coo_alg2`` + - Should provide best performance for COO format and column-major layout. + Produces deterministic results. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG2`` + * - ``coo_alg3`` + - Should provide best performance for COO format and large ``nnz``. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG3`` + * - ``coo_alg4`` + - Should provide best performance for COO format and row-major layout. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG4`` + * - ``csr_alg1`` + - Should provide best performance for CSR format and column-major layout. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG1`` + * - ``csr_alg2`` + - Should provide best performance for CSR format and row-major layout. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG2`` + * - ``csr_alg3`` + - Deterministic algorithm for CSR format. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG3`` + + +spmv +^^^^ + +.. list-table:: + :header-rows: 1 + :widths: 10 30 45 + + * - Value + - Description + - Backend equivalent + * - ``default_alg`` + - Default algorithm. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMV_ALG_DEFAULT`` + * - ``no_optimize_alg`` + - Default algorithm but may skip some optimizations. Useful only if an + operation with the same configuration is run once. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMV_ALG_DEFAULT`` + * - ``coo_alg1`` + - Default algorithm for COO format. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMV_COO_ALG1`` + * - ``coo_alg2`` + - Deterministic algorithm for COO format. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMV_COO_ALG2`` + * - ``csr_alg1`` + - Default algorithm for CSR format. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMV_CSR_ALG1`` + * - ``csr_alg2`` + - Deterministic algorithm for CSR format. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPMV_CSR_ALG2`` + * - ``csr_alg3`` + - LRB variant of the algorithm for CSR format. + - | MKL: none + | cuSPARSE: none + + +spsv +^^^^ + +..
list-table:: + :header-rows: 1 + :widths: 10 30 45 + + * - Value + - Description + - Backend equivalent + * - ``default_optimize_alg`` + - Default algorithm. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPSV_ALG_DEFAULT`` + * - ``no_optimize_alg`` + - Default algorithm but may skip some optimizations. Useful only if an + operation with the same configuration is run once. + - | MKL: none + | cuSPARSE: ``CUSPARSE_SPSV_ALG_DEFAULT`` diff --git a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt index 5dbbba8a4..a38f4ebd4 100644 --- a/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt +++ b/examples/sparse_blas/compile_time_dispatching/CMakeLists.txt @@ -18,27 +18,24 @@ #=============================================================================== #Build object from all sources -set(SPARSE_BLAS_BACKENDS "") - -if(ENABLE_MKLCPU_BACKEND) - list(APPEND SPARSE_BLAS_BACKENDS "mklcpu") +set(SPARSE_CT_SOURCES "") +if(ENABLE_MKLCPU_BACKEND AND ENABLE_CUSPARSE_BACKEND) + list(APPEND SPARSE_CT_SOURCES "sparse_blas_spmv_usm_mklcpu_cusparse") endif() include(WarningsUtils) -foreach(backend ${SPARSE_BLAS_BACKENDS}) - set(EXAMPLE_NAME example_sparse_blas_spmv_usm_${backend}) - add_executable(${EXAMPLE_NAME} sparse_blas_spmv_usm_${backend}.cpp) - target_include_directories(${EXAMPLE_NAME} +foreach(sparse_ct_source ${SPARSE_CT_SOURCES}) + add_executable(${sparse_ct_source} ${sparse_ct_source}.cpp) + target_include_directories(${sparse_ct_source} PUBLIC ${PROJECT_SOURCE_DIR}/examples/include PUBLIC ${PROJECT_SOURCE_DIR}/include PUBLIC ${CMAKE_BINARY_DIR}/bin ) - add_dependencies(${EXAMPLE_NAME} onemkl_sparse_blas_${backend}) - target_link_libraries(${EXAMPLE_NAME} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_${backend}) + target_link_libraries(${sparse_ct_source} PRIVATE ONEMKL::SYCL::SYCL onemkl_sparse_blas_mklcpu onemkl_sparse_blas_cusparse) # Register example as ctest - add_test(NAME
sparse_blas/EXAMPLE/CT/sparse_blas_spmv_usm_${backend} COMMAND ${EXAMPLE_NAME}) -endforeach(backend) + add_test(NAME sparse_blas/EXAMPLE/CT/${sparse_ct_source} COMMAND ${sparse_ct_source}) +endforeach(sparse_ct_source) diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp new file mode 100644 index 000000000..d025539f8 --- /dev/null +++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp @@ -0,0 +1,291 @@ +/******************************************************************************* +* Copyright 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. +* +* +* SPDX-License-Identifier: Apache-2.0 +*******************************************************************************/ + +/* +* +* Content: +* This example demonstrates use of DPCPP API oneapi::mkl::sparse::spmv +* using unified shared memory to perform general sparse matrix-vector +* multiplication on an Intel CPU and an NVIDIA GPU SYCL device.
+* +* y = alpha * op(A) * x + beta * y +* +* where op() is defined by one of +* +* oneapi::mkl::transpose::{nontrans,trans,conjtrans} +* +* +* This example demonstrates only single precision (float) data type for +* spmv matrix data +* +* +*******************************************************************************/ + +// stl includes +#include +#include + +#if __has_include() +#include +#else +#include +#endif +#include "oneapi/mkl.hpp" + +#include "example_helper.hpp" + +// +// Main example for Sparse Matrix-Vector Multiply consisting of +// initialization of A matrix, x and y vectors as well as +// scalars alpha and beta. Then the product +// +// y = alpha * op(A) * x + beta * y +// +// is performed and finally the results are post processed. +// +template +int run_sparse_matrix_vector_multiply_example(const selectorType &selector) { + auto queue = selector.get_queue(); + + // Matrix data size + intType size = 4; + intType nrows = size * size * size; + + // Set scalar fpType values + fpType alpha = set_fp_value(fpType(1.0)); + fpType beta = set_fp_value(fpType(0.0)); + + intType *ia, *ja; + fpType *a, *x, *y, *z; + std::size_t sizea = static_cast(27 * nrows); + std::size_t sizeja = static_cast(27 * nrows); + std::size_t sizeia = static_cast(nrows + 1); + std::size_t sizevec = static_cast(nrows); + + ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), queue); + ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), queue); + a = (fpType *)sycl::malloc_shared(sizea * sizeof(fpType), queue); + x = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue); + y = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue); + z = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue); + + if (!ia || !ja || !a || !x || !y || !z) { + throw std::runtime_error("Failed to allocate USM memory"); + } + + intType nnz = generate_sparse_matrix(size, ia, ja, a); + + // Init vectors x and y + for (int i = 0; i < nrows; i++) { + x[i] = 
set_fp_value(fpType(1.0)); + y[i] = set_fp_value(fpType(0.0)); + z[i] = set_fp_value(fpType(0.0)); + } + + std::vector int_ptr_vec; + int_ptr_vec.push_back(ia); + int_ptr_vec.push_back(ja); + std::vector fp_ptr_vec; + fp_ptr_vec.push_back(a); + fp_ptr_vec.push_back(x); + fp_ptr_vec.push_back(y); + fp_ptr_vec.push_back(z); + + // + // Execute Matrix Multiply + // + + oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans; + oneapi::mkl::sparse::spmv_alg alg = oneapi::mkl::sparse::spmv_alg::default_alg; + oneapi::mkl::sparse::matrix_view A_view; + + std::cout << "\n\t\tsparse::spmv parameters:\n"; + std::cout << "\t\t\ttransA = " + << (transA == oneapi::mkl::transpose::nontrans + ? "nontrans" + : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans")) + << std::endl; + std::cout << "\t\t\tnrows = " << nrows << std::endl; + std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl; + + // Create and initialize handle for a Sparse Matrix in CSR format + oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr; + oneapi::mkl::sparse::init_csr_matrix(selector, &A_handle, nrows, nrows, nnz, + oneapi::mkl::index_base::zero, ia, ja, a); + + // Create and initialize dense vector handles + oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr; + oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr; + oneapi::mkl::sparse::init_dense_vector(selector, &x_handle, sizevec, x); + oneapi::mkl::sparse::init_dense_vector(selector, &y_handle, sizevec, y); + + // Create operation descriptor + oneapi::mkl::sparse::spmv_descr_t descr = nullptr; + oneapi::mkl::sparse::init_spmv_descr(selector, &descr); + + // Allocate external workspace + std::size_t workspace_size = 0; + oneapi::mkl::sparse::spmv_buffer_size(selector, transA, &alpha, A_view, A_handle, x_handle, + &beta, y_handle, alg, descr, workspace_size); + void *workspace = sycl::malloc_device(workspace_size, queue); + + // Optimize spmv + auto ev_opt = + 
oneapi::mkl::sparse::spmv_optimize(selector, transA, &alpha, A_view, A_handle, x_handle, + &beta, y_handle, alg, descr, workspace); + + // Run spmv + auto ev_spmv = oneapi::mkl::sparse::spmv(selector, transA, &alpha, A_view, A_handle, x_handle, + &beta, y_handle, alg, descr, { ev_opt }); + + // Release handles and descriptor + std::vector release_events; + release_events.push_back( + oneapi::mkl::sparse::release_dense_vector(selector, x_handle, { ev_spmv })); + release_events.push_back( + oneapi::mkl::sparse::release_dense_vector(selector, y_handle, { ev_spmv })); + release_events.push_back( + oneapi::mkl::sparse::release_sparse_matrix(selector, A_handle, { ev_spmv })); + release_events.push_back(oneapi::mkl::sparse::release_spmv_descr(selector, descr, { ev_spmv })); + for (auto event : release_events) { + event.wait_and_throw(); + } + + // + // Post Processing + // + + fpType *res = y; + const bool isConj = (transA == oneapi::mkl::transpose::conjtrans); + for (intType row = 0; row < nrows; row++) { + z[row] *= beta; + } + for (intType row = 0; row < nrows; row++) { + fpType tmp = alpha * x[row]; + for (intType i = ia[row]; i < ia[row + 1]; i++) { + if constexpr (is_complex()) { + z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]); + } + else { + z[ja[i]] += tmp * a[i]; + } + } + } + + bool good = true; + for (intType row = 0; row < nrows; row++) { + good &= check_result(res[row], z[row], nrows, row); + } + + std::cout << "\n\t\t sparse::spmv example " << (good ? 
"passed" : "failed") << "\n\tFinished" + << std::endl; + + free_vec(fp_ptr_vec, queue); + free_vec(int_ptr_vec, queue); + + if (!good) + return 1; + + return 0; +} + +// +// Description of example setup, apis used and supported floating point type +// precisions +// +void print_example_banner() { + std::cout << "" << std::endl; + std::cout << "########################################################################" + << std::endl; + std::cout << "# Sparse Matrix-Vector Multiply Example: " << std::endl; + std::cout << "# " << std::endl; + std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl; + std::cout << "# " << std::endl; + std::cout << "# where A is a sparse matrix in CSR format, x and y are " + "dense vectors" + << std::endl; + std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl; + std::cout << "# " << std::endl; + std::cout << "# Using apis:" << std::endl; + std::cout << "# sparse::spmv" << std::endl; + std::cout << "# " << std::endl; + std::cout << "# Using single precision (float) data type" << std::endl; + std::cout << "# " << std::endl; + std::cout << "# Running on both Intel CPU and Nvidia GPU devices" << std::endl; + std::cout << "# " << std::endl; + std::cout << "########################################################################" + << std::endl; + std::cout << std::endl; +} + +// +// Main entry point for example +// +int main(int /*argc*/, char ** /*argv*/) { + print_example_banner(); + + auto exception_handler = [](sycl::exception_list exceptions) { + for (std::exception_ptr const &e : exceptions) { + try { + std::rethrow_exception(e); + } + catch (sycl::exception const &e) { + std::cout << "Caught asynchronous SYCL " + "exception during sparse::spmv:\n" + << e.what() << std::endl; + } + } + }; + + try { + sycl::queue cpu_queue(sycl::cpu_selector_v, exception_handler); + sycl::queue gpu_queue(sycl::gpu_selector_v, exception_handler); + unsigned int vendor_id = gpu_queue.get_info(); + if (vendor_id 
!= NVIDIA_ID) { + std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl; + return 1; + } + oneapi::mkl::backend_selector cpu_selector{ cpu_queue }; + oneapi::mkl::backend_selector gpu_selector{ gpu_queue }; + + std::cout << "Running Sparse BLAS SPMV USM example on:" << std::endl; + std::cout << "\tCPU device: " << cpu_queue.get_info() + << std::endl; + std::cout << "\tGPU device: " << gpu_queue.get_info() + << std::endl; + std::cout << "Running with single precision real data type:" << std::endl; + + run_sparse_matrix_vector_multiply_example(cpu_selector); + run_sparse_matrix_vector_multiply_example(gpu_selector); + std::cout << "Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE." << std::endl; + } + catch (sycl::exception const &e) { + std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl; + std::cerr << "\t" << e.what() << std::endl; + std::cerr << "\tSYCL error code: " << e.code().value() << std::endl; + return 1; + } + catch (std::exception const &e) { + std::cerr << "Caught std::exception during Sparse SPMV:" << std::endl; + std::cerr << "\t" << e.what() << std::endl; + return 1; + } + + return 0; +} diff --git a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt index 398f3e0f2..f09daf819 100644 --- a/examples/sparse_blas/run_time_dispatching/CMakeLists.txt +++ b/examples/sparse_blas/run_time_dispatching/CMakeLists.txt @@ -33,6 +33,9 @@ endif() if(ENABLE_MKLGPU_BACKEND) list(APPEND DEVICE_FILTERS "level_zero:gpu") endif() +if(ENABLE_CUSPARSE_BACKEND) + list(APPEND DEVICE_FILTERS "cuda:gpu") +endif() message(STATUS "ONEAPI_DEVICE_SELECTOR will be set to the following value(s): [${DEVICE_FILTERS}] for run-time dispatching examples") diff --git a/include/oneapi/mkl/detail/backends.hpp b/include/oneapi/mkl/detail/backends.hpp index 32b7c2614..216a6feba 100644 --- a/include/oneapi/mkl/detail/backends.hpp +++ b/include/oneapi/mkl/detail/backends.hpp @@ 
-40,20 +40,31 @@ enum class backend { cufft, rocfft, portfft, + cusparse, unsupported }; typedef std::map backendmap; -static backendmap backend_map = { - { backend::mklcpu, "mklcpu" }, { backend::mklgpu, "mklgpu" }, - { backend::cublas, "cublas" }, { backend::cusolver, "cusolver" }, - { backend::curand, "curand" }, { backend::netlib, "netlib" }, - { backend::rocblas, "rocblas" }, { backend::rocrand, "rocrand" }, - { backend::rocsolver, "rocsolver" }, { backend::portblas, "portblas" }, - { backend::cufft, "cufft" }, { backend::rocfft, "rocfft" }, - { backend::portfft, "portfft" }, { backend::unsupported, "unsupported" } -}; +// clang-format alternate the formatting depending on the parity of the number of backends +// It is disabled to reduce noise +// clang-format off +static backendmap backend_map = { { backend::mklcpu, "mklcpu" }, + { backend::mklgpu, "mklgpu" }, + { backend::cublas, "cublas" }, + { backend::cusolver, "cusolver" }, + { backend::curand, "curand" }, + { backend::netlib, "netlib" }, + { backend::rocblas, "rocblas" }, + { backend::rocrand, "rocrand" }, + { backend::rocsolver, "rocsolver" }, + { backend::portblas, "portblas" }, + { backend::cufft, "cufft" }, + { backend::rocfft, "rocfft" }, + { backend::portfft, "portfft" }, + { backend::cusparse, "cusparse" }, + { backend::unsupported, "unsupported" } }; +// clang-format on } //namespace mkl } //namespace oneapi diff --git a/include/oneapi/mkl/detail/backends_table.hpp b/include/oneapi/mkl/detail/backends_table.hpp index 8e68674cc..8a79c5c06 100644 --- a/include/oneapi/mkl/detail/backends_table.hpp +++ b/include/oneapi/mkl/detail/backends_table.hpp @@ -186,6 +186,12 @@ static std::map>> libraries = { #ifdef ENABLE_MKLGPU_BACKEND LIB_NAME("sparse_blas_mklgpu") +#endif + } }, + { device::nvidiagpu, + { +#ifdef ENABLE_CUSPARSE_BACKEND + LIB_NAME("sparse_blas_cusparse") #endif } } } }, }; diff --git a/include/oneapi/mkl/sparse_blas.hpp b/include/oneapi/mkl/sparse_blas.hpp index 912a20eb8..73e6753c7 
100644 --- a/include/oneapi/mkl/sparse_blas.hpp +++ b/include/oneapi/mkl/sparse_blas.hpp @@ -34,6 +34,9 @@ #ifdef ENABLE_MKLGPU_BACKEND #include "sparse_blas/detail/mklgpu/sparse_blas_ct.hpp" #endif +#ifdef ENABLE_CUSPARSE_BACKEND +#include "sparse_blas/detail/cusparse/sparse_blas_ct.hpp" +#endif #include "sparse_blas/detail/sparse_blas_rt.hpp" diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp new file mode 100644 index 000000000..6de2802f1 --- /dev/null +++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp @@ -0,0 +1,35 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_ +#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_ + +#include "oneapi/mkl/detail/export.hpp" +#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp" +#include "oneapi/mkl/sparse_blas/types.hpp" + +namespace oneapi::mkl::sparse::cusparse { + +namespace detail = oneapi::mkl::sparse::detail; + +#include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx" + +} // namespace oneapi::mkl::sparse::cusparse + +#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_ONEMKL_SPARSE_BLAS_CUSPARSE_HPP_ diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp new file mode 100644 index 000000000..11abb9a6f --- /dev/null +++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/sparse_blas_ct.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_ +#define _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_ + +#include "oneapi/mkl/detail/backends.hpp" +#include "oneapi/mkl/detail/backend_selector.hpp" + +#include "onemkl_sparse_blas_cusparse.hpp" + +namespace oneapi { +namespace mkl { +namespace sparse { + +#define BACKEND cusparse +#include "oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx" +#undef BACKEND + +} //namespace sparse +} //namespace mkl +} //namespace oneapi + +#endif // _ONEMKL_SPARSE_BLAS_DETAIL_CUSPARSE_SPARSE_BLAS_CT_HPP_ diff --git a/src/config.hpp.in b/src/config.hpp.in index 5698abf9b..fd55006a6 100644 --- a/src/config.hpp.in +++ b/src/config.hpp.in @@ -24,6 +24,7 @@ #cmakedefine ENABLE_CUFFT_BACKEND #cmakedefine ENABLE_CURAND_BACKEND #cmakedefine ENABLE_CUSOLVER_BACKEND +#cmakedefine ENABLE_CUSPARSE_BACKEND #cmakedefine ENABLE_MKLCPU_BACKEND #cmakedefine ENABLE_MKLGPU_BACKEND #cmakedefine ENABLE_NETLIB_BACKEND diff --git a/src/sparse_blas/backends/CMakeLists.txt b/src/sparse_blas/backends/CMakeLists.txt index 294040808..baae9445d 100644 --- a/src/sparse_blas/backends/CMakeLists.txt +++ b/src/sparse_blas/backends/CMakeLists.txt @@ -27,3 +27,7 @@ endif() if(ENABLE_MKLGPU_BACKEND) add_subdirectory(mklgpu) endif() + +if(ENABLE_CUSPARSE_BACKEND) + add_subdirectory(cusparse) +endif() diff --git a/src/sparse_blas/backends/cusparse/CMakeLists.txt b/src/sparse_blas/backends/cusparse/CMakeLists.txt new file mode 100644 index 000000000..60bbaf35f --- /dev/null +++ b/src/sparse_blas/backends/cusparse/CMakeLists.txt @@ -0,0 +1,85 @@ +#=============================================================================== +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# +# SPDX-License-Identifier: Apache-2.0 +#=============================================================================== + +set(LIB_NAME onemkl_sparse_blas_cusparse) +set(LIB_OBJ ${LIB_NAME}_obj) + +include(WarningsUtils) + +add_library(${LIB_NAME}) +add_library(${LIB_OBJ} OBJECT + cusparse_handles.cpp + cusparse_scope_handle.cpp + operations/cusparse_spmm.cpp + operations/cusparse_spmv.cpp + operations/cusparse_spsv.cpp + $<$: cusparse_wrappers.cpp> +) +add_dependencies(onemkl_backend_libs_sparse_blas ${LIB_NAME}) + +target_include_directories(${LIB_OBJ} + PRIVATE ${PROJECT_SOURCE_DIR}/include + ${PROJECT_SOURCE_DIR}/src + ${CMAKE_BINARY_DIR}/bin + ${ONEMKL_GENERATED_INCLUDE_PATH} +) + +target_compile_options(${LIB_OBJ} PRIVATE ${ONEMKL_BUILD_COPT}) + +if (${CMAKE_VERSION} VERSION_LESS "3.17.0") + find_package(CUDA 12.2 REQUIRED) + target_include_directories(${LIB_OBJ} PRIVATE ${CUDA_INCLUDE_DIRS}) + target_link_libraries(${LIB_OBJ} PUBLIC cuda rt ${CUDA_cusparse_LIBRARY}) +else() + find_package(CUDAToolkit 12.2 REQUIRED) + target_link_libraries(${LIB_OBJ} PRIVATE CUDA::cusparse CUDA::cudart CUDA::cuda_driver) +endif() + +target_link_libraries(${LIB_OBJ} + PUBLIC ONEMKL::SYCL::SYCL + PRIVATE onemkl_warnings +) + +set_target_properties(${LIB_OBJ} PROPERTIES + POSITION_INDEPENDENT_CODE ON +) +target_link_libraries(${LIB_NAME} PUBLIC ${LIB_OBJ}) + +#Set oneMKL libraries as not transitive for dynamic +if(BUILD_SHARED_LIBS) + set_target_properties(${LIB_NAME} PROPERTIES + INTERFACE_LINK_LIBRARIES ONEMKL::SYCL::SYCL + ) +endif() + +# Add major version to 
the library +set_target_properties(${LIB_NAME} PROPERTIES + SOVERSION ${PROJECT_VERSION_MAJOR} +) + +# Add dependencies rpath to the library +list(APPEND CMAKE_BUILD_RPATH $<TARGET_FILE_DIR:${LIB_NAME}>) + +# Add the library to install package +install(TARGETS ${LIB_OBJ} EXPORT oneMKLTargets) +install(TARGETS ${LIB_NAME} EXPORT oneMKLTargets + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib +) diff --git a/src/sparse_blas/backends/cusparse/cusparse_error.hpp b/src/sparse_blas/backends/cusparse/cusparse_error.hpp new file mode 100644 index 000000000..7d6bf45d7 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_error.hpp @@ -0,0 +1,100 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License.
+* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_ +#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_ + +#include <string> + +#include <cuda.h> +#include <cusparse.h> + +#include "oneapi/mkl/exceptions.hpp" + +namespace oneapi::mkl::sparse::cusparse { + +inline std::string cuda_result_to_str(CUresult result) { + switch (result) { +#define ONEMKL_CUSPARSE_CASE(STATUS) \ + case STATUS: return #STATUS + ONEMKL_CUSPARSE_CASE(CUDA_SUCCESS); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_NOT_PERMITTED); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_CONTEXT); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_DEVICE); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_INVALID_VALUE); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_OUT_OF_MEMORY); + ONEMKL_CUSPARSE_CASE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); + default: return "<unknown>"; + } +} + +#define CUDA_ERROR_FUNC(func, ...) \ + do { \ + auto res = func(__VA_ARGS__); \ + if (res != CUDA_SUCCESS) { \ + throw oneapi::mkl::exception("sparse_blas", #func, \ + "cuda error: " + cuda_result_to_str(res)); \ + } \ + } while (0) + +inline std::string cusparse_status_to_str(cusparseStatus_t status) { + switch (status) { +#define ONEMKL_CUSPARSE_CASE(STATUS) \ + case STATUS: return #STATUS + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_SUCCESS); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_INITIALIZED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ALLOC_FAILED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INVALID_VALUE); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_ARCH_MISMATCH); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_EXECUTION_FAILED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INTERNAL_ERROR); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_NOT_SUPPORTED); + ONEMKL_CUSPARSE_CASE(CUSPARSE_STATUS_INSUFFICIENT_RESOURCES); +#undef ONEMKL_CUSPARSE_CASE + default: return "<unknown>"; + } +} + +inline void check_status(cusparseStatus_t status, const std::string& function, + std::string error_str = "") { + if (status
!= CUSPARSE_STATUS_SUCCESS) { + if (!error_str.empty()) { + error_str += "; "; + } + error_str += "cuSPARSE status: " + cusparse_status_to_str(status); + switch (status) { + case CUSPARSE_STATUS_NOT_SUPPORTED: + throw oneapi::mkl::unimplemented("sparse_blas", function, error_str); + case CUSPARSE_STATUS_INVALID_VALUE: + throw oneapi::mkl::invalid_argument("sparse_blas", function, error_str); + default: throw oneapi::mkl::exception("sparse_blas", function, error_str); + } + } +} + +#define CUSPARSE_ERR_FUNC(func, ...) \ + do { \ + auto status = func(__VA_ARGS__); \ + check_status(status, #func); \ + } while (0) + +} // namespace oneapi::mkl::sparse::cusparse + +#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp new file mode 100644 index 000000000..59e582a65 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp @@ -0,0 +1,63 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_ +#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_ + +/** + * @file Similar to blas_handle.hpp + * Provides a map from a ur_context_handle_t (or equivalent) to a cusparseHandle_t. + * @see cusparse_scope_handle.hpp +*/ + +#include <atomic> +#include <unordered_map> + +namespace oneapi::mkl::sparse::cusparse { + +template <typename T> +struct cusparse_global_handle { + using handle_container_t = std::unordered_map<T, std::atomic<cusparseHandle_t> *>; + handle_container_t cusparse_global_handle_mapper_{}; + + ~cusparse_global_handle() noexcept(false) { + for (auto &handle_pair : cusparse_global_handle_mapper_) { + if (handle_pair.second != nullptr) { + auto handle = handle_pair.second->exchange(nullptr); + if (handle != nullptr) { + CUSPARSE_ERR_FUNC(cusparseDestroy, handle); + handle = nullptr; + } + else { + // if the handle is nullptr it means the handle was already + // destroyed by the ContextCallback and we're free to delete the + // atomic object. + delete handle_pair.second; + } + + handle_pair.second = nullptr; + } + } + cusparse_global_handle_mapper_.clear(); + } +}; + +} // namespace oneapi::mkl::sparse::cusparse + +#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp new file mode 100644 index 000000000..de7236110 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -0,0 +1,520 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "cusparse_error.hpp" +#include "cusparse_helper.hpp" +#include "cusparse_handles.hpp" +#include "cusparse_task.hpp" +#include "sparse_blas/macros.hpp" + +namespace oneapi::mkl::sparse::cusparse { + +/** + * In this file CusparseScopedContextHandler are used to ensure that a cusparseHandle_t is created before any other cuSPARSE call, as required by the specification. +*/ + +// Dense vector +template +void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size, + sycl::buffer val) { + auto event = queue.submit([&](sycl::handler &cgh) { + auto acc = val.template get_access(cgh); + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + auto cuda_value_type = CudaEnumType::value; + cusparseDnVecDescr_t cu_dvhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, sc.get_mem(acc), + cuda_value_type); + *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); + }); + }); + event.wait_and_throw(); +} + +template +void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size, + fpType *val) { + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + sc.get_handle(queue); + auto cuda_value_type = CudaEnumType::value; + cusparseDnVecDescr_t cu_dvhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, sc.get_mem(val), + cuda_value_type); + *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); + }); + }); + event.wait_and_throw(); +} + +template +void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size, + sycl::buffer val) { + detail::check_can_reset_value_handle(__func__, dvhandle, true); + auto event = queue.submit([&](sycl::handler &cgh) { + auto acc = val.template get_access(cgh); + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + if (dvhandle->size != size) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, + sc.get_mem(acc), cuda_value_type); + dvhandle->size = size; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, + sc.get_mem(acc)); + } + dvhandle->set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template +void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size, + fpType *val) { + detail::check_can_reset_value_handle(__func__, dvhandle, false); + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + sc.get_handle(queue); + if (dvhandle->size != size) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, + sc.get_mem(val), cuda_value_type); + dvhandle->size = size; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, + sc.get_mem(val)); + } + dvhandle->set_usm_ptr(val); + }); + }); + event.wait_and_throw(); +} + +FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); + +sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, + const std::vector &dependencies) { + // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used + auto functor = [=](CusparseScopedContextHandler &) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); + delete dvhandle; + }; + return dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dvhandle); +} + +// Dense matrix +template +void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t 
*p_dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout, + sycl::buffer val) { + auto event = queue.submit([&](sycl::handler &cgh) { + auto acc = val.template get_access(cgh); + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + sc.get_handle(queue); + auto cuda_value_type = CudaEnumType::value; + auto cuda_order = get_cuda_order(dense_layout); + cusparseDnMatDescr_t cu_dmhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, + sc.get_mem(acc), cuda_value_type, cuda_order); + *p_dmhandle = + new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout); + }); + }); + event.wait_and_throw(); +} + +template +void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType *val) { + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + auto cuda_value_type = CudaEnumType::value; + auto cuda_order = get_cuda_order(dense_layout); + cusparseDnMatDescr_t cu_dmhandle; + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, + sc.get_mem(val), cuda_value_type, cuda_order); + *p_dmhandle = + new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout); + }); + }); + event.wait_and_throw(); +} + +template +void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, + oneapi::mkl::layout dense_layout, sycl::buffer val) { + detail::check_can_reset_value_handle(__func__, dmhandle, true); + auto event = queue.submit([&](sycl::handler &cgh) { + auto acc = val.template get_access(cgh); + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + sc.get_handle(queue); + if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || + dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); + auto cuda_value_type = CudaEnumType::value; + auto cuda_order = get_cuda_order(dense_layout); + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, + num_cols, ld, sc.get_mem(acc), cuda_value_type, cuda_order); + dmhandle->num_rows = num_rows; + dmhandle->num_cols = num_cols; + dmhandle->ld = ld; + dmhandle->dense_layout = dense_layout; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, + sc.get_mem(acc)); + } + dmhandle->set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template +void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, + oneapi::mkl::layout dense_layout, fpType *val) { + detail::check_can_reset_value_handle(__func__, 
dmhandle, false); + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + sc.get_handle(queue); + if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || + dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); + auto cuda_value_type = CudaEnumType::value; + auto cuda_order = get_cuda_order(dense_layout); + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, + num_cols, ld, sc.get_mem(val), cuda_value_type, cuda_order); + dmhandle->num_rows = num_rows; + dmhandle->num_cols = num_cols; + dmhandle->ld = ld; + dmhandle->dense_layout = dense_layout; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, + sc.get_mem(val)); + } + dmhandle->set_usm_ptr(val); + }); + }); + event.wait_and_throw(); +} + +FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); + +sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, + const std::vector &dependencies) { + // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used + auto functor = [=](CusparseScopedContextHandler &) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); + delete dmhandle; + }; + return dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dmhandle); +} + +// COO matrix +template +void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer row_ind, sycl::buffer col_ind, + sycl::buffer val) { + auto event = queue.submit([&](sycl::handler &cgh) { + auto row_acc = row_ind.template get_access(cgh); + auto col_acc = col_ind.template get_access(cgh); + auto val_acc = val.template get_access(cgh); + 
submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. + sc.get_handle(queue); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, + sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc), + cuda_index_type, cuda_index_base, cuda_value_type); + *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, num_rows, num_cols, + nnz, index); + }); + }); + event.wait_and_throw(); +} + +template +void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType *row_ind, intType *col_ind, fpType *val) { + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, + sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val), + cuda_index_type, cuda_index_base, cuda_value_type); + *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, num_rows, num_cols, + nnz, index); + }); + }); + event.wait_and_throw(); +} + +template +void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer row_ind, sycl::buffer col_ind, + sycl::buffer val) { + detail::check_can_reset_sparse_handle(__func__, smhandle, true); + auto event = queue.submit([&](sycl::handler &cgh) { + auto row_acc = row_ind.template get_access(cgh); + auto col_acc = col_ind.template get_access(cgh); + auto val_acc = val.template get_access(cgh); + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || + smhandle->nnz != nnz || smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, + nnz, sc.get_mem(row_acc), sc.get_mem(col_acc), + sc.get_mem(val_acc), cuda_index_type, cuda_index_base, + cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, + sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc)); + } + smhandle->row_container.set_buffer(row_ind); + smhandle->col_container.set_buffer(col_ind); + smhandle->value_container.set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template +void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType *row_ind, intType *col_ind, fpType *val) { + detail::check_can_reset_sparse_handle(__func__, smhandle, false); + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || + smhandle->nnz != nnz || smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, + nnz, sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val), + cuda_index_type, cuda_index_base, cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, + sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val)); + } + smhandle->row_container.set_usm_ptr(row_ind); + smhandle->col_container.set_usm_ptr(col_ind); + smhandle->value_container.set_usm_ptr(val); + }); + }); + event.wait_and_throw(); +} + +FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS); + +// CSR matrix +template +void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer row_ptr, sycl::buffer col_ind, + sycl::buffer val) { + auto event = queue.submit([&](sycl::handler &cgh) { + auto row_acc = row_ptr.template get_access(cgh); + auto col_acc = col_ind.template get_access(cgh); + auto val_acc = val.template get_access(cgh); + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, + sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc), + cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); + *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, num_rows, num_cols, + nnz, index); + }); + }); + event.wait_and_throw(); +} + +template +void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType *row_ptr, intType *col_ind, fpType *val) { + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + cusparseSpMatDescr_t cu_smhandle; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, + sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val), + cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); + *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, num_rows, num_cols, + nnz, index); + }); + }); + event.wait_and_throw(); +} + +template +void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + sycl::buffer row_ptr, sycl::buffer col_ind, + sycl::buffer val) { + detail::check_can_reset_sparse_handle(__func__, smhandle, true); + auto event = queue.submit([&](sycl::handler &cgh) { + auto row_acc = row_ptr.template get_access(cgh); + auto col_acc = col_ind.template get_access(cgh); + auto val_acc = val.template get_access(cgh); + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || + smhandle->nnz != nnz || smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, + nnz, sc.get_mem(row_acc), sc.get_mem(col_acc), + sc.get_mem(val_acc), cuda_index_type, cuda_index_type, + cuda_index_base, cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, + sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc)); + } + smhandle->row_container.set_buffer(row_ptr); + smhandle->col_container.set_buffer(col_ind); + smhandle->value_container.set_buffer(val); + }); + }); + event.wait_and_throw(); +} + +template +void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, + intType *row_ptr, intType *col_ind, fpType *val) { + detail::check_can_reset_sparse_handle(__func__, smhandle, false); + auto event = queue.submit([&](sycl::handler &cgh) { + submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
+ sc.get_handle(queue); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || + smhandle->nnz != nnz || smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, + nnz, sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val), + cuda_index_type, cuda_index_type, cuda_index_base, + cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, + sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val)); + } + smhandle->row_container.set_usm_ptr(row_ptr); + smhandle->col_container.set_usm_ptr(col_ind); + smhandle->value_container.set_usm_ptr(val); + }); + }); + event.wait_and_throw(); +} + +FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); + +sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle, + const std::vector &dependencies) { + // Use dispatch_submit to ensure the backend's handle is kept alive as long as the buffers are used + auto functor = [=](CusparseScopedContextHandler &) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + delete smhandle; + }; + return dispatch_submit(__func__, queue, dependencies, functor, smhandle); +} + +// Matrix property +bool set_matrix_property(sycl::queue &, matrix_handle_t smhandle, matrix_property property) { + // No equivalent in cuSPARSE + // Store the matrix property internally for future usages + smhandle->set_matrix_property(property); + return false; +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp new 
file mode 100644 index 000000000..ac22d33ae --- /dev/null +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp @@ -0,0 +1,78 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_ +#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_ + +#include + +#include "sparse_blas/generic_container.hpp" + +namespace oneapi::mkl::sparse { + +// Complete the definition of incomplete types dense_vector_handle, dense_matrix_handle and matrix_handle. 
+ +struct dense_vector_handle : public detail::generic_dense_vector_handle<cusparseDnVecDescr_t> { + template <typename T> + dense_vector_handle(cusparseDnVecDescr_t cu_descr, T* value_ptr, std::int64_t size) + : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_ptr, size) { + } + + template <typename T> + dense_vector_handle(cusparseDnVecDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer, + std::int64_t size) + : detail::generic_dense_vector_handle<cusparseDnVecDescr_t>(cu_descr, value_buffer, + size) {} +}; + +struct dense_matrix_handle : public detail::generic_dense_matrix_handle<cusparseDnMatDescr_t> { + template <typename T> + dense_matrix_handle(cusparseDnMatDescr_t cu_descr, T* value_ptr, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout) + : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>( + cu_descr, value_ptr, num_rows, num_cols, ld, dense_layout) {} + + template <typename T> + dense_matrix_handle(cusparseDnMatDescr_t cu_descr, const sycl::buffer<T, 1> value_buffer, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, + layout dense_layout) + : detail::generic_dense_matrix_handle<cusparseDnMatDescr_t>( + cu_descr, value_buffer, num_rows, num_cols, ld, dense_layout) {} +}; + +struct matrix_handle : public detail::generic_sparse_handle<cusparseSpMatDescr_t> { + template <typename fpType, typename intType> + matrix_handle(cusparseSpMatDescr_t cu_descr, intType* row_ptr, intType* col_ptr, + fpType* value_ptr, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, + oneapi::mkl::index_base index) + : detail::generic_sparse_handle<cusparseSpMatDescr_t>( + cu_descr, row_ptr, col_ptr, value_ptr, num_rows, num_cols, nnz, index) {} + + template <typename fpType, typename intType> + matrix_handle(cusparseSpMatDescr_t cu_descr, const sycl::buffer<intType, 1> row_buffer, + const sycl::buffer<intType, 1> col_buffer, + const sycl::buffer<fpType, 1> value_buffer, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index) + : detail::generic_sparse_handle<cusparseSpMatDescr_t>( + cu_descr, row_buffer, col_buffer, value_buffer, num_rows, num_cols, nnz, index) {} +}; + +} // namespace oneapi::mkl::sparse + +#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_ diff --git
a/src/sparse_blas/backends/cusparse/cusparse_helper.hpp b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
new file mode 100644
index 000000000..b392071f5
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp
@@ -0,0 +1,165 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
+
+#include <complex>
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include <cusparse.h>
+
+#include "oneapi/mkl/sparse_blas/types.hpp"
+#include "sparse_blas/enum_data_types.hpp"
+#include "sparse_blas/sycl_helper.hpp"
+#include "cusparse_error.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+template <typename T>
+struct CudaEnumType;
+template <>
+struct CudaEnumType<float> {
+    static constexpr cudaDataType_t value = CUDA_R_32F;
+};
+template <>
+struct CudaEnumType<double> {
+    static constexpr cudaDataType_t value = CUDA_R_64F;
+};
+template <>
+struct CudaEnumType<std::complex<float>> {
+    static constexpr cudaDataType_t value = CUDA_C_32F;
+};
+template <>
+struct CudaEnumType<std::complex<double>> {
+    static constexpr cudaDataType_t value = CUDA_C_64F;
+};
+
+template <typename T>
+struct CudaIndexEnumType;
+template <>
+struct CudaIndexEnumType<std::int32_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_32I;
+};
+template <>
+struct CudaIndexEnumType<std::int64_t> {
+    static constexpr cusparseIndexType_t value = CUSPARSE_INDEX_64I;
+};
+
+template <typename E>
+inline std::string cast_enum_to_str(E e) {
+    return std::to_string(static_cast<char>(e));
+}
+
+inline cudaDataType_t get_cuda_value_type(detail::data_type onemkl_data_type) {
+    switch (onemkl_data_type) {
+        case detail::data_type::real_fp32: return CUDA_R_32F;
+        case detail::data_type::real_fp64: return CUDA_R_64F;
+        case detail::data_type::complex_fp32: return CUDA_C_32F;
+        case detail::data_type::complex_fp64: return CUDA_C_64F;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_value_type",
+                "Invalid data type: " + cast_enum_to_str(onemkl_data_type));
+    }
+}
+
+inline cusparseOrder_t get_cuda_order(layout l) {
+    switch (l) {
+        case layout::row_major: return CUSPARSE_ORDER_ROW;
+        case layout::col_major: return CUSPARSE_ORDER_COL;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_order",
+                                                "Unknown layout: " + cast_enum_to_str(l));
+    }
+}
+
+inline cusparseIndexBase_t get_cuda_index_base(index_base index) {
+    switch (index) {
+        case index_base::zero: return CUSPARSE_INDEX_BASE_ZERO;
+        case index_base::one: return CUSPARSE_INDEX_BASE_ONE;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_index_base",
+                                                "Unknown index_base: " + cast_enum_to_str(index));
+    }
+}
+
+/// Return the CUDA transpose operation from a oneMKL type.
+/// Do not conjugate for real types to avoid an invalid argument.
+inline cusparseOperation_t get_cuda_operation(detail::data_type type, transpose op) {
+    switch (op) {
+        case transpose::nontrans: return CUSPARSE_OPERATION_NON_TRANSPOSE;
+        case transpose::trans: return CUSPARSE_OPERATION_TRANSPOSE;
+        case transpose::conjtrans:
+            return (type == detail::data_type::complex_fp32 ||
+                    type == detail::data_type::complex_fp64)
+                       ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
+                       : CUSPARSE_OPERATION_TRANSPOSE;
+        default:
+            throw oneapi::mkl::invalid_argument(
+                "sparse_blas", "get_cuda_operation",
+                "Unknown transpose operation: " + cast_enum_to_str(op));
+    }
+}
+
+inline auto get_cuda_uplo(uplo uplo_val) {
+    switch (uplo_val) {
+        case uplo::upper: return CUSPARSE_FILL_MODE_UPPER;
+        case uplo::lower: return CUSPARSE_FILL_MODE_LOWER;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_uplo",
+                                                "Unknown uplo: " + cast_enum_to_str(uplo_val));
+    }
+}
+
+inline auto get_cuda_diag(diag diag_val) {
+    switch (diag_val) {
+        case diag::nonunit: return CUSPARSE_DIAG_TYPE_NON_UNIT;
+        case diag::unit: return CUSPARSE_DIAG_TYPE_UNIT;
+        default:
+            throw oneapi::mkl::invalid_argument("sparse_blas", "get_cuda_diag",
+                                                "Unknown diag: " + cast_enum_to_str(diag_val));
+    }
+}
+
+inline void set_matrix_attributes(const std::string& func_name, cusparseSpMatDescr_t cu_a,
+                                  oneapi::mkl::sparse::matrix_view A_view) {
+    auto cu_fill_mode = get_cuda_uplo(A_view.uplo_view);
+    auto status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_FILL_MODE, &cu_fill_mode,
+                                            sizeof(cu_fill_mode));
+    check_status(status, func_name + "/set_uplo");
+
+    auto cu_diag_type = get_cuda_diag(A_view.diag_view);
+    status = cusparseSpMatSetAttribute(cu_a, CUSPARSE_SPMAT_DIAG_TYPE, &cu_diag_type,
+                                       sizeof(cu_diag_type));
+    check_status(status, func_name + "/set_diag");
+}
+
+/**
+ * cuSPARSE requires the pointer mode to be set for scalar parameters (typically alpha and beta).
+ */
+inline void set_pointer_mode(cusparseHandle_t cu_handle, bool is_ptr_host_accessible) {
+    auto mode = is_ptr_host_accessible ? CUSPARSE_POINTER_MODE_HOST : CUSPARSE_POINTER_MODE_DEVICE;
+    check_status(cusparseSetPointerMode(cu_handle, mode), __func__);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
new file mode 100644
index 000000000..c25c7c92f
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp
@@ -0,0 +1,147 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+/**
+ * @file Similar to cublas_scope_handle.cpp
+*/
+
+#include "cusparse_scope_handle.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+/// Per-thread storage of the cuSPARSE handles created by get_handle_and_stream, looked up by the
+/// native context they were created for.
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+thread_local cusparse_global_handle<ur_context_handle_t>
+    CusparseScopedContextHandler::handle_helper = cusparse_global_handle<ur_context_handle_t>{};
+#else
+thread_local cusparse_global_handle<pi_context> CusparseScopedContextHandler::handle_helper =
+    cusparse_global_handle<pi_context>{};
+#endif
+
+CusparseScopedContextHandler::CusparseScopedContextHandler(sycl::queue queue,
+                                                           sycl::interop_handle &ih)
+        : ih(ih),
+          needToRecover_(false) {
+    placedContext_ = new sycl::context(queue.get_context());
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuCtxGetCurrent, &original_);
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+    if (original_ != desired) {
+        // Sets the desired context as the active one for the thread
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, desired);
+        // No context is installed and the suggested context is primary
+        // This is the most common case. We can activate the context in the
+        // thread and leave it there until all the PI context referring to the
+        // same underlying CUDA primary context are destroyed. This emulates
+        // the behaviour of the CUDA runtime api, and avoids costly context
+        // switches. No action is required on this side of the if.
+        needToRecover_ = original_ != nullptr;
+    }
+}
+
+CusparseScopedContextHandler::~CusparseScopedContextHandler() noexcept(false) {
+    if (needToRecover_) {
+        CUDA_ERROR_FUNC(cuCtxSetCurrent, original_);
+    }
+    delete placedContext_;
+}
+
+void ContextCallback(void *userData) {
+    auto *ptr = static_cast<std::atomic<cusparseHandle_t> *>(userData);
+    if (!ptr) {
+        return;
+    }
+    auto handle = ptr->exchange(nullptr);
+    if (handle != nullptr) {
+        CUSPARSE_ERR_FUNC(cusparseDestroy, handle);
+        handle = nullptr;
+    }
+    else {
+        // if the handle is nullptr it means the handle was already destroyed by
+        // the cusparse_global_handle destructor and we're free to delete the atomic
+        // object.
+        delete ptr;
+    }
+}
+
+std::pair<cusparseHandle_t, CUstream> CusparseScopedContextHandler::get_handle_and_stream(
+    const sycl::queue &queue) {
+    auto cudaDevice = ih.get_native_device<sycl::backend::ext_oneapi_cuda>();
+    CUcontext desired;
+    CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice);
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+    auto piPlacedContext_ = reinterpret_cast<ur_context_handle_t>(desired);
+#else
+    auto piPlacedContext_ = reinterpret_cast<pi_context>(desired);
+#endif
+    CUstream streamId = get_stream(queue);
+    auto it = handle_helper.cusparse_global_handle_mapper_.find(piPlacedContext_);
+    if (it != handle_helper.cusparse_global_handle_mapper_.end()) {
+        if (it->second == nullptr) {
+            handle_helper.cusparse_global_handle_mapper_.erase(it);
+        }
+        else {
+            auto handle = it->second->load();
+            if (handle != nullptr) {
+                cudaStream_t currentStreamId;
+                CUSPARSE_ERR_FUNC(cusparseGetStream, handle, &currentStreamId);
+                if (currentStreamId != streamId) {
+                    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+                }
+                return { handle, streamId };
+            }
+            else {
+                handle_helper.cusparse_global_handle_mapper_.erase(it);
+            }
+        }
+    }
+
+    cusparseHandle_t handle;
+    CUSPARSE_ERR_FUNC(cusparseCreate, &handle);
+    CUSPARSE_ERR_FUNC(cusparseSetStream, handle, streamId);
+
+    auto insert_iter = handle_helper.cusparse_global_handle_mapper_.insert(
+        std::make_pair(piPlacedContext_, new std::atomic<cusparseHandle_t>(handle)));
+
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+    sycl::detail::ur::contextSetExtendedDeleter(*placedContext_, ContextCallback,
+                                                insert_iter.first->second);
+#else
+    sycl::detail::pi::contextSetExtendedDeleter(*placedContext_, ContextCallback,
+                                                insert_iter.first->second);
+#endif
+
+    return { handle, streamId };
+}
+
+cusparseHandle_t CusparseScopedContextHandler::get_handle(const sycl::queue &queue) {
+    return get_handle_and_stream(queue).first;
+}
+
+CUstream CusparseScopedContextHandler::get_stream(const sycl::queue &queue) {
+    return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
+}
+
+sycl::context CusparseScopedContextHandler::get_context(const sycl::queue &queue) {
+    return queue.get_context();
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
new file mode 100644
index 000000000..b56bb07cf
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp
@@ -0,0 +1,93 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
+
+/**
+ * @file Similar to cublas_scope_handle.hpp
+*/
+
+#if __has_include(<sycl/sycl.hpp>)
+#include <sycl/sycl.hpp>
+#else
+#include <CL/sycl.hpp>
+#endif
+
+// After Plugin Interface removal in DPC++ ur.hpp is the new include
+#if __has_include(<sycl/detail/ur.hpp>) && !defined(ONEAPI_ONEMKL_PI_INTERFACE_REMOVED)
+#define ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+#endif
+
+#include <thread>
+
+#include "cusparse_error.hpp"
+#include "cusparse_global_handle.hpp"
+#include "cusparse_helper.hpp"
+
+namespace oneapi::mkl::sparse::cusparse {
+
+class CusparseScopedContextHandler {
+    CUcontext original_;
+    sycl::context *placedContext_;
+    sycl::interop_handle &ih;
+    bool needToRecover_;
+
+#ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED
+    static thread_local cusparse_global_handle<ur_context_handle_t> handle_helper;
+#else
+    static thread_local cusparse_global_handle<pi_context> handle_helper;
+#endif
+
+    CUstream get_stream(const sycl::queue &queue);
+    sycl::context get_context(const sycl::queue &queue);
+
+public:
+    CusparseScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih);
+
+    ~CusparseScopedContextHandler() noexcept(false);
+
+    /**
+     * @brief Returns the cuSPARSE handle stored for the calling thread and the queue's native
+     * context, creating it if needed (following NVIDIA's advice of one handle per device per
+     * thread), and binds it to the queue's native CUDA stream.
+     * @param queue sycl queue.
+     * @return a pair of: a cusparseHandle_t used to call cuSPARSE routines; and the CUDA stream
+     */
+    std::pair<cusparseHandle_t, CUstream> get_handle_and_stream(const sycl::queue &queue);
+
+    /// See get_handle_and_stream
+    cusparseHandle_t get_handle(const sycl::queue &queue);
+
+    // This is a work-around function for reinterpret_casting the memory. This
+    // will be fixed when SYCL-2020 has been implemented for Pi backend.
+    template <typename AccT>
+    inline void *get_mem(AccT acc) {
+        auto cudaPtr = ih.get_native_mem<sycl::backend::ext_oneapi_cuda>(acc);
+        return reinterpret_cast<void *>(cudaPtr);
+    }
+
+    template <typename T>
+    inline void *get_mem(T *ptr) {
+        return reinterpret_cast<void *>(ptr);
+    }
+};
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
new file mode 100644
index 000000000..e839c5100
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp
@@ -0,0 +1,382 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*
+**************************************************************************/
+
+#ifndef _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
+#define _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
+
+#include "cusparse_handles.hpp"
+#include "cusparse_scope_handle.hpp"
+
+/// This file provides helper functions to submit host_task using buffers or USM seamlessly
+
+namespace oneapi::mkl::sparse::cusparse {
+
+template <typename T, typename Container>
+auto get_value_accessor(sycl::handler &cgh, Container container) {
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1> *>(container->value_container.buffer_ptr.get());
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T, typename... Ts>
+auto get_fp_accessors(sycl::handler &cgh, Ts... containers) {
+    return std::array<sycl::accessor<T, 1>, sizeof...(containers)>{ get_value_accessor<T>(
+        cgh, containers)... };
+}
+
+template <typename T>
+auto get_row_accessor(sycl::handler &cgh, matrix_handle_t smhandle) {
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1> *>(smhandle->row_container.buffer_ptr.get());
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T>
+auto get_col_accessor(sycl::handler &cgh, matrix_handle_t smhandle) {
+    auto buffer_ptr =
+        reinterpret_cast<sycl::buffer<T, 1> *>(smhandle->col_container.buffer_ptr.get());
+    return buffer_ptr->template get_access<sycl::access::mode::read_write>(cgh);
+}
+
+template <typename T>
+auto get_int_accessors(sycl::handler &cgh, matrix_handle_t smhandle) {
+    return std::array<sycl::accessor<T, 1>, 2>{ get_row_accessor<T>(cgh, smhandle),
+                                                get_col_accessor<T>(cgh, smhandle) };
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                      CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors' pointers have already been set to the native container types in previous functions
+    cgh.host_task([functor, queue, capture_only_accessors...](sycl::interop_handle ih) {
+        auto unused = std::make_tuple(capture_only_accessors...);
+        (void)unused;
+        auto sc = CusparseScopedContextHandler(queue, ih);
+        functor(sc);
+    });
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_host_task_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                               sycl::accessor<std::uint8_t> workspace_placeholder_acc,
+                               CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors' pointers have already been set to the native container types in previous functions
+    cgh.require(workspace_placeholder_acc);
+    cgh.host_task([functor, queue, workspace_placeholder_acc,
+                   capture_only_accessors...](sycl::interop_handle ih) {
+        auto unused = std::make_tuple(capture_only_accessors...);
+        (void)unused;
+        auto sc = CusparseScopedContextHandler(queue, ih);
+        functor(sc, workspace_placeholder_acc);
+    });
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                               const std::vector<sycl::event> &dependencies,
+                               CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors' pointers have already been set to the native container types in previous functions
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    cgh.ext_codeplay_enqueue_native_command(
+        [functor, queue, dependencies, capture_only_accessors...](sycl::interop_handle ih) {
+            auto unused = std::make_tuple(capture_only_accessors...);
+            (void)unused;
+            auto sc = CusparseScopedContextHandler(queue, ih);
+            // The functor using ext_codeplay_enqueue_native_command needs to
+            // explicitly wait on the events for the SPARSE domain. The
+            // extension ext_codeplay_enqueue_native_command is used to launch
+            // the compute operation which depends on the previous optimize
+            // step. In cuSPARSE the optimize step is synchronous but it is
+            // asynchronous in oneMKL Interface. The optimize step may not use
+            // the CUDA stream which would make it impossible for
+            // ext_codeplay_enqueue_native_command to automatically ensure it
+            // has completed before the compute function starts. These waits are
+            // used to ensure the optimize step has completed before starting
+            // the computation.
+            for (auto event : dependencies) {
+                event.wait();
+            }
+            functor(sc);
+        });
+#else
+    (void)dependencies;
+    submit_host_task(cgh, queue, functor, capture_only_accessors...);
+#endif
+}
+
+template <typename Functor, typename... CaptureOnlyAcc>
+void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor functor,
+                                        const std::vector<sycl::event> &dependencies,
+                                        sycl::accessor<std::uint8_t> workspace_placeholder_acc,
+                                        CaptureOnlyAcc... capture_only_accessors) {
+    // Only capture the accessors to ensure the dependencies are properly handled
+    // The accessors' pointers have already been set to the native container types in previous functions
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    cgh.require(workspace_placeholder_acc);
+    cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies,
+                                             workspace_placeholder_acc,
+                                             capture_only_accessors...](sycl::interop_handle ih) {
+        auto unused = std::make_tuple(capture_only_accessors...);
+        (void)unused;
+        auto sc = CusparseScopedContextHandler(queue, ih);
+        // The functor using ext_codeplay_enqueue_native_command needs to
+        // explicitly wait on the events for the SPARSE domain. The
+        // extension ext_codeplay_enqueue_native_command is used to launch
+        // the compute operation which depends on the previous optimize
+        // step. In cuSPARSE the optimize step is synchronous but it is
+        // asynchronous in oneMKL Interface. The optimize step may not use
+        // the CUDA stream which would make it impossible for
+        // ext_codeplay_enqueue_native_command to automatically ensure it
+        // has completed before the compute function starts. These waits are
+        // used to ensure the optimize step has completed before starting
+        // the computation.
+        for (auto event : dependencies) {
+            event.wait();
+        }
+        functor(sc, workspace_placeholder_acc);
+    });
+#else
+    (void)dependencies;
+    submit_host_task_with_acc(cgh, queue, functor, workspace_placeholder_acc,
+                              capture_only_accessors...);
+#endif
+}
+
+/// Helper submit functions to capture all accessors from the generic containers
+/// \p other_containers and ensure the dependencies of buffers are respected.
+/// The accessors are not directly used as the underlying data pointer has
+/// already been captured in previous functions.
+/// \p workspace_placeholder_acc is a placeholder accessor that will be bound to
+/// the cgh if not empty and given to the functor as a last argument.
+/// \p UseWorkspace must be true to use the placeholder accessor.
+/// \p UseEnqueueNativeCommandExt controls whether host_tasks are used or the
+/// extension ext_codeplay_enqueue_native_command is used to launch tasks. The
+/// extension should only be used for asynchronous functions using native
+/// backend's functions.
+template <bool UseWorkspace, bool UseEnqueueNativeCommandExt, typename Functor, typename... Ts>
+sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl::queue queue,
+                                        const std::vector<sycl::event> &dependencies,
+                                        Functor functor, matrix_handle_t sm_handle,
+                                        sycl::accessor<std::uint8_t> workspace_placeholder_acc,
+                                        Ts... other_containers) {
+    if (sm_handle->all_use_buffer()) {
+        detail::data_type value_type = sm_handle->get_value_type();
+        detail::data_type int_type = sm_handle->get_int_type();
+
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE) \
+    return queue.submit([&](sycl::handler &cgh) { \
+        cgh.depends_on(dependencies); \
+        auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, sm_handle, other_containers...); \
+        auto int_accs = get_int_accessors<INT_TYPE>(cgh, sm_handle); \
+        if constexpr (UseWorkspace) { \
+            if constexpr (UseEnqueueNativeCommandExt) { \
+                submit_native_command_ext_with_acc(cgh, queue, functor, dependencies, \
+                                                   workspace_placeholder_acc, fp_accs, int_accs); \
+            } \
+            else { \
+                submit_host_task_with_acc(cgh, queue, functor, workspace_placeholder_acc, fp_accs, \
+                                          int_accs); \
+            } \
+        } \
+        else { \
+            (void)workspace_placeholder_acc; \
+            if constexpr (UseEnqueueNativeCommandExt) { \
+                submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, int_accs); \
+            } \
+            else { \
+                submit_host_task(cgh, queue, functor, fp_accs, int_accs); \
+            } \
+        } \
+    })
+#define ONEMKL_CUSPARSE_SUBMIT_INT(FP_TYPE) \
+    if (int_type == detail::data_type::int32) { \
+        ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int32_t); \
+    } \
+    else if (int_type == detail::data_type::int64) { \
+        ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int64_t); \
+    }
+
+        if (value_type == detail::data_type::real_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(float)
+        }
+        else if (value_type == detail::data_type::real_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(double)
+        }
+        else if (value_type == detail::data_type::complex_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<float>)
+        }
+        else if (value_type == detail::data_type::complex_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT_INT(std::complex<double>)
+        }
+
+#undef ONEMKL_CUSPARSE_SUBMIT_INT
+#undef ONEMKL_CUSPARSE_SUBMIT
+
+        throw oneapi::mkl::exception("sparse_blas", function_name,
+                                     "Could not dispatch buffer kernel to a supported type");
+    }
+    else {
+        // USM submit does not need to capture accessors
+        if constexpr (!UseWorkspace) {
+            return queue.submit([&](sycl::handler &cgh) {
+                cgh.depends_on(dependencies);
+                if constexpr (UseEnqueueNativeCommandExt) {
+                    submit_native_command_ext(cgh, queue, functor, dependencies);
+                }
+                else {
+                    submit_host_task(cgh, queue, functor);
+                }
+            });
+        }
+        else {
+            throw oneapi::mkl::exception("sparse_blas", function_name,
+                                         "Internal error: Cannot use accessor workspace with USM");
+        }
+    }
+}
+
+/// Similar to dispatch_submit_impl_fp_int but only dispatches the host_task based on the floating point value type.
+template <typename Functor, typename ContainerT>
+sycl::event dispatch_submit_impl_fp(const std::string &function_name, sycl::queue queue,
+                                    const std::vector<sycl::event> &dependencies, Functor functor,
+                                    ContainerT container_handle) {
+    if (container_handle->all_use_buffer()) {
+        detail::data_type value_type = container_handle->get_value_type();
+
+#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE) \
+    return queue.submit([&](sycl::handler &cgh) { \
+        cgh.depends_on(dependencies); \
+        auto fp_accs = get_fp_accessors<FP_TYPE>(cgh, container_handle); \
+        submit_host_task(cgh, queue, functor, fp_accs); \
+    })
+
+        if (value_type == detail::data_type::real_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT(float);
+        }
+        else if (value_type == detail::data_type::real_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT(double);
+        }
+        else if (value_type == detail::data_type::complex_fp32) {
+            ONEMKL_CUSPARSE_SUBMIT(std::complex<float>);
+        }
+        else if (value_type == detail::data_type::complex_fp64) {
+            ONEMKL_CUSPARSE_SUBMIT(std::complex<double>);
+        }
+
+#undef ONEMKL_CUSPARSE_SUBMIT
+
+        throw oneapi::mkl::exception("sparse_blas", function_name,
+                                     "Could not dispatch buffer kernel to a supported type");
+    }
+    else {
+        return queue.submit([&](sycl::handler &cgh) {
+            cgh.depends_on(dependencies);
+            submit_host_task(cgh, queue, functor);
+        });
+    }
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, Functor functor,
+                            matrix_handle_t sm_handle,
+                            sycl::accessor<std::uint8_t> workspace_placeholder_acc,
+                            Ts... other_containers) {
+    constexpr bool UseWorkspace = true;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, workspace_placeholder_acc,
+        other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue,
+                            const std::vector<sycl::event> &dependencies, Functor functor,
+                            matrix_handle_t sm_handle, Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, dependencies, functor, sm_handle, {}, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, Functor functor,
+                            matrix_handle_t sm_handle, Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+    constexpr bool UseEnqueueNativeCommandExt = false;
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, {}, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       sycl::accessor<std::uint8_t> workspace_placeholder_acc,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = true;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+    constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, workspace_placeholder_acc,
+        other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue,
+                                       const std::vector<sycl::event> &dependencies,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+    constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, dependencies, functor, sm_handle, {}, other_containers...);
+}
+
+/// Helper function for dispatch_submit_impl_fp_int
+template <typename Functor, typename... Ts>
+sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue,
+                                       Functor functor, matrix_handle_t sm_handle,
+                                       Ts... other_containers) {
+    constexpr bool UseWorkspace = false;
+#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
+    constexpr bool UseEnqueueNativeCommandExt = true;
+#else
+    constexpr bool UseEnqueueNativeCommandExt = false;
+#endif
+    return dispatch_submit_impl_fp_int<UseWorkspace, UseEnqueueNativeCommandExt>(
+        function_name, queue, {}, functor, sm_handle, {}, other_containers...);
+}
+
+} // namespace oneapi::mkl::sparse::cusparse
+
+#endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_
diff --git a/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
new file mode 100644
index 000000000..278aec296
--- /dev/null
+++ b/src/sparse_blas/backends/cusparse/cusparse_wrappers.cpp
@@ -0,0 +1,32 @@
+/***************************************************************************
+*  Copyright (C) Codeplay Software Limited
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  For your convenience, a copy of the License has been included in this
+*  repository.
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/types.hpp" + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/function_table.hpp" + +#define WRAPPER_VERSION 1 +#define BACKEND cusparse + +extern "C" sparse_blas_function_table_t mkl_sparse_blas_table = { + WRAPPER_VERSION, +#include "sparse_blas/backends/backend_wrappers.cxx" +}; diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp new file mode 100644 index 000000000..09fe0515e --- /dev/null +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -0,0 +1,296 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/backends/cusparse/cusparse_error.hpp" +#include "sparse_blas/backends/cusparse/cusparse_helper.hpp" +#include "sparse_blas/backends/cusparse/cusparse_task.hpp" +#include "sparse_blas/backends/cusparse/cusparse_handles.hpp" +#include "sparse_blas/common_op_verification.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" + +namespace oneapi::mkl::sparse { + +// Complete the definition of the incomplete type +struct spmm_descr { + detail::generic_container workspace; + std::size_t temp_buffer_size = 0; + bool buffer_size_called = false; + bool optimized_called = false; + oneapi::mkl::transpose last_optimized_opA; + oneapi::mkl::transpose last_optimized_opB; + oneapi::mkl::sparse::matrix_view last_optimized_A_view; + oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; + oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_B_handle; + oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_C_handle; + oneapi::mkl::sparse::spmm_alg last_optimized_alg; +}; + +} // namespace oneapi::mkl::sparse + +namespace oneapi::mkl::sparse::cusparse { + +void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { + *p_spmm_descr = new spmm_descr(); +} + +sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, + const std::vector& dependencies) { + return detail::submit_release(queue, spmm_descr, dependencies); +} + +inline auto get_cuda_spmm_alg(spmm_alg alg) { + switch (alg) { + case spmm_alg::coo_alg1: return CUSPARSE_SPMM_COO_ALG1; + case spmm_alg::coo_alg2: return CUSPARSE_SPMM_COO_ALG2; + case spmm_alg::coo_alg3: return CUSPARSE_SPMM_COO_ALG3; + case spmm_alg::coo_alg4: return CUSPARSE_SPMM_COO_ALG4; + case spmm_alg::csr_alg1: return CUSPARSE_SPMM_CSR_ALG1; + case 
spmm_alg::csr_alg2: return CUSPARSE_SPMM_CSR_ALG2; + case spmm_alg::csr_alg3: return CUSPARSE_SPMM_CSR_ALG3; + default: return CUSPARSE_SPMM_ALG_DEFAULT; + } +} + +inline void fallback_alg_if_needed(oneapi::mkl::sparse::spmm_alg& alg, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB) { + if (alg == oneapi::mkl::sparse::spmm_alg::csr_alg3 && + (opA != oneapi::mkl::transpose::nontrans || opB == oneapi::mkl::transpose::conjtrans)) { + // Avoid warnings printed on std::cerr + alg = oneapi::mkl::sparse::spmm_alg::default_alg; + } +} + +void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, + oneapi::mkl::sparse::dense_matrix_handle_t C_handle, + oneapi::mkl::sparse::spmm_alg alg, + oneapi::mkl::sparse::spmm_descr_t spmm_descr, std::size_t& temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); + fallback_alg_if_needed(alg, opA, opB); + auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler& sc) { + auto cu_handle = sc.get_handle(queue); + auto cu_a = A_handle->backend_handle; + auto cu_b = B_handle->backend_handle; + auto cu_c = C_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op_a = get_cuda_operation(type, opA); + auto cu_op_b = get_cuda_operation(type, opB); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spmm_alg(alg); + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, + cu_c, cu_type, cu_alg, 
&temp_buffer_size); + check_status(status, __func__); + }; + auto event = dispatch_submit(__func__, queue, functor, A_handle, B_handle, C_handle); + event.wait_and_throw(); + spmm_descr->temp_buffer_size = temp_buffer_size; + spmm_descr->buffer_size_called = true; +} + +inline void common_spmm_optimize( + oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, bool is_alpha_host_accessible, + oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, bool is_beta_host_accessible, + oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, + oneapi::mkl::sparse::spmm_descr_t spmm_descr) { + detail::check_valid_spmm_common("spmm_optimize", A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); + if (!spmm_descr->buffer_size_called) { + throw mkl::uninitialized("sparse_blas", "spmm_optimize", + "spmm_buffer_size must be called before spmm_optimize."); + } + spmm_descr->optimized_called = true; + spmm_descr->last_optimized_opA = opA; + spmm_descr->last_optimized_opB = opB; + spmm_descr->last_optimized_A_view = A_view; + spmm_descr->last_optimized_A_handle = A_handle; + spmm_descr->last_optimized_B_handle = B_handle; + spmm_descr->last_optimized_C_handle = C_handle; + spmm_descr->last_optimized_alg = alg; +} + +void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, + oneapi::mkl::sparse::dense_matrix_handle_t C_handle, + oneapi::mkl::sparse::spmm_alg alg, void* workspace_ptr, + bool is_alpha_host_accessible) { + auto cu_a = A_handle->backend_handle; + auto cu_b = B_handle->backend_handle; + auto cu_c = C_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op_a = get_cuda_operation(type, opA); + 
auto cu_op_b = get_cuda_operation(type, opB); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spmm_alg(alg); + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMM_preprocess(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, + cu_c, cu_type, cu_alg, workspace_ptr); + check_status(status, "optimize_spmm"); +} + +void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, + oneapi::mkl::sparse::dense_matrix_handle_t C_handle, + oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, + sycl::buffer workspace) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (!A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, + is_beta_host_accessible, C_handle, alg, spmm_descr); + // Copy the buffer to extend its lifetime until the descriptor is free'd. 
+ spmm_descr->workspace.set_buffer_untyped(workspace); + if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg || workspace.size() == 0) { + // cusparseSpMM_preprocess cannot be called if the workspace is empty + return; + } + fallback_alg_if_needed(alg, opA, opB); + auto functor = [=](CusparseScopedContextHandler& sc, + sycl::accessor workspace_acc) { + auto cu_handle = sc.get_handle(queue); + auto workspace_ptr = sc.get_mem(workspace_acc); + spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg, + workspace_ptr, is_alpha_host_accessible); + }; + + sycl::accessor workspace_placeholder_acc(workspace); + dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, B_handle, + C_handle); +} + +sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, + oneapi::mkl::sparse::dense_matrix_handle_t C_handle, + oneapi::mkl::sparse::spmm_alg alg, + oneapi::mkl::sparse::spmm_descr_t spmm_descr, void* workspace, + const std::vector& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, + is_beta_host_accessible, C_handle, alg, spmm_descr); + spmm_descr->workspace.usm_ptr = workspace; + if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg || workspace == nullptr) { + // cusparseSpMM_preprocess cannot be called if the workspace is empty + return detail::collapse_dependencies(queue, dependencies); + } + fallback_alg_if_needed(alg, opA, opB); + auto functor = [=](CusparseScopedContextHandler& sc) 
{ + auto cu_handle = sc.get_handle(queue); + spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg, + workspace, is_alpha_host_accessible); + }; + + return dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle, C_handle); +} + +sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, + oneapi::mkl::sparse::dense_matrix_handle_t C_handle, + oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, + const std::vector& dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); + if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) { + detail::throw_incompatible_container(__func__); + } + + if (!spmm_descr->optimized_called) { + throw mkl::uninitialized("sparse_blas", __func__, + "spmm_optimize must be called before spmm."); + } + CHECK_DESCR_MATCH(spmm_descr, opA, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, opB, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, A_view, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, A_handle, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, B_handle, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, C_handle, "spmm_optimize"); + CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize"); + + fallback_alg_if_needed(alg, opA, opB); + auto compute_functor = [=](CusparseScopedContextHandler& sc, void* workspace_ptr) { + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + auto cu_a = A_handle->backend_handle; + auto cu_b = B_handle->backend_handle; + auto cu_c = 
C_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op_a = get_cuda_operation(type, opA); + auto cu_op_b = get_cuda_operation(type, opB); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spmm_alg(alg); + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c, + cu_type, cu_alg, workspace_ptr); + check_status(status, __func__); +#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); +#endif + }; + if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) { + // The accessor can only be bound to the cgh if the buffer size is + // greater than 0 + auto functor_buffer = [=](CusparseScopedContextHandler& sc, + sycl::accessor workspace_acc) { + auto workspace_ptr = sc.get_mem(workspace_acc); + compute_functor(sc, workspace_ptr); + }; + sycl::accessor workspace_placeholder_acc( + spmm_descr->workspace.get_buffer()); + return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, + workspace_placeholder_acc, B_handle, C_handle); + } + else { + // The same dispatch_submit can be used for USM or buffers if no + // workspace accessor is needed, workspace_ptr will be a nullptr in the + // latter case. 
+ auto workspace_ptr = spmm_descr->workspace.usm_ptr; + auto functor_usm = [=](CusparseScopedContextHandler& sc) { + compute_functor(sc, workspace_ptr); + }; + return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle, + B_handle, C_handle); + } +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp new file mode 100644 index 000000000..e06f84695 --- /dev/null +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -0,0 +1,323 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/backends/cusparse/cusparse_error.hpp" +#include "sparse_blas/backends/cusparse/cusparse_helper.hpp" +#include "sparse_blas/backends/cusparse/cusparse_task.hpp" +#include "sparse_blas/backends/cusparse/cusparse_handles.hpp" +#include "sparse_blas/common_op_verification.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" + +namespace oneapi::mkl::sparse { + +// Complete the definition of the incomplete type +struct spmv_descr { + detail::generic_container workspace; + std::size_t temp_buffer_size = 0; + bool buffer_size_called = false; + bool optimized_called = false; + oneapi::mkl::transpose last_optimized_opA; + oneapi::mkl::sparse::matrix_view last_optimized_A_view; + oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; + oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; + oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; + oneapi::mkl::sparse::spmv_alg last_optimized_alg; +}; + +} // namespace oneapi::mkl::sparse + +namespace oneapi::mkl::sparse::cusparse { + +void init_spmv_descr(sycl::queue & /*queue*/, spmv_descr_t *p_spmv_descr) { + *p_spmv_descr = new spmv_descr(); +} + +sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, + const std::vector &dependencies) { + return detail::submit_release(queue, spmv_descr, dependencies); +} + +inline auto get_cuda_spmv_alg(spmv_alg alg) { + switch (alg) { + case spmv_alg::coo_alg1: return CUSPARSE_SPMV_COO_ALG1; + case spmv_alg::coo_alg2: return CUSPARSE_SPMV_COO_ALG2; + case spmv_alg::csr_alg1: return CUSPARSE_SPMV_CSR_ALG1; + case spmv_alg::csr_alg2: return CUSPARSE_SPMV_CSR_ALG2; + default: return CUSPARSE_SPMV_ALG_DEFAULT; + } +} + +void check_valid_spmv(const std::string &function_name, 
oneapi::mkl::transpose opA, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + bool is_alpha_host_accessible, bool is_beta_host_accessible) { + detail::check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); + if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support spmv with a `type_view` other than `matrix_descr::general`."); + } +} + +void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spmv_alg alg, + oneapi::mkl::sparse::spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, + is_beta_host_accessible); + auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) { + auto cu_handle = sc.get_handle(queue); + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spmv_alg(alg); + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, + cu_type, cu_alg, &temp_buffer_size); + 
check_status(status, __func__); + }; + auto event = dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + event.wait_and_throw(); + spmv_descr->temp_buffer_size = temp_buffer_size; + spmv_descr->buffer_size_called = true; +} + +inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + bool is_beta_host_accessible, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spmv_alg alg, + oneapi::mkl::sparse::spmv_descr_t spmv_descr) { + check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); + if (!spmv_descr->buffer_size_called) { + throw mkl::uninitialized("sparse_blas", "spmv_optimize", + "spmv_buffer_size must be called before spmv_optimize."); + } + spmv_descr->optimized_called = true; + spmv_descr->last_optimized_opA = opA; + spmv_descr->last_optimized_A_view = A_view; + spmv_descr->last_optimized_A_handle = A_handle; + spmv_descr->last_optimized_x_handle = x_handle; + spmv_descr->last_optimized_y_handle = y_handle; + spmv_descr->last_optimized_alg = alg; +} + +#if CUSPARSE_VERSION >= 12300 +// cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) +void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spmv_alg alg, void *workspace_ptr, + bool is_alpha_host_accessible) { + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = 
get_cuda_value_type(type); + auto cu_alg = get_cuda_spmv_alg(alg); + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMV_preprocess(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, + cu_alg, workspace_ptr); + check_status(status, "optimize_spmv"); +} +#endif + +void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, + sycl::buffer workspace) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (!A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + is_beta_host_accessible, y_handle, alg, spmv_descr); + // Copy the buffer to extend its lifetime until the descriptor is free'd. 
+ spmv_descr->workspace.set_buffer_untyped(workspace); + if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + return; + } + +#if CUSPARSE_VERSION < 12300 + // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) + return; +#else + if (spmv_descr->temp_buffer_size > 0) { + auto functor = [=](CusparseScopedContextHandler &sc, + sycl::accessor workspace_acc) { + auto cu_handle = sc.get_handle(queue); + auto workspace_ptr = sc.get_mem(workspace_acc); + spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, + workspace_ptr, is_alpha_host_accessible); + }; + + // The accessor can only be bound to the cgh if the buffer size is + // greater than 0 + sycl::accessor workspace_placeholder_acc(workspace); + dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, x_handle, + y_handle); + } + else { + auto functor = [=](CusparseScopedContextHandler &sc) { + auto cu_handle = sc.get_handle(queue); + spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, + nullptr, is_alpha_host_accessible); + }; + dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + } +#endif +} + +sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spmv_alg alg, + oneapi::mkl::sparse::spmv_descr_t spmv_descr, void *workspace, + const std::vector &dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + if (A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + 
is_beta_host_accessible, y_handle, alg, spmv_descr); + spmv_descr->workspace.usm_ptr = workspace; + if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + return detail::collapse_dependencies(queue, dependencies); + } + +#if CUSPARSE_VERSION < 12300 + // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) + return detail::collapse_dependencies(queue, dependencies); +#else + auto functor = [=](CusparseScopedContextHandler &sc) { + auto cu_handle = sc.get_handle(queue); + spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, + workspace, is_alpha_host_accessible); + }; + return dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle); +#endif +} + +sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, + const std::vector &dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, + is_beta_host_accessible); + if (A_handle->all_use_buffer() != spmv_descr->workspace.use_buffer()) { + detail::throw_incompatible_container(__func__); + } + + if (!spmv_descr->optimized_called) { + throw mkl::uninitialized("sparse_blas", __func__, + "spmv_optimize must be called before spmv."); + } + CHECK_DESCR_MATCH(spmv_descr, opA, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, A_view, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, A_handle, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, x_handle, "spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, y_handle, 
"spmv_optimize"); + CHECK_DESCR_MATCH(spmv_descr, alg, "spmv_optimize"); + + auto compute_functor = [=](CusparseScopedContextHandler &sc, void *workspace_ptr) { + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spmv_alg(alg); + // Workaround issue with captured alpha and beta causing a segfault inside cuSPARSE + // Copy alpha and beta locally in the largest data value type and use the local pointer + cuDoubleComplex local_alpha, local_beta; + const void *alpha_ptr = alpha, *beta_ptr = beta; + if (is_alpha_host_accessible) { + local_alpha = *reinterpret_cast(alpha_ptr); + local_beta = *reinterpret_cast(beta_ptr); + alpha_ptr = &local_alpha; + beta_ptr = &local_beta; + } + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMV(cu_handle, cu_op, alpha_ptr, cu_a, cu_x, beta_ptr, cu_y, cu_type, + cu_alg, workspace_ptr); + check_status(status, __func__); +#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); +#endif + }; + if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) { + // The accessor can only be bound to the cgh if the buffer size is + // greater than 0 + auto functor_buffer = [=](CusparseScopedContextHandler &sc, + sycl::accessor workspace_acc) { + auto workspace_ptr = sc.get_mem(workspace_acc); + compute_functor(sc, workspace_ptr); + }; + sycl::accessor workspace_placeholder_acc( + spmv_descr->workspace.get_buffer()); + return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, + workspace_placeholder_acc, x_handle, y_handle); + } + else { + // The same dispatch_submit can be used for USM or buffers if no + // workspace accessor is needed, workspace_ptr will 
be a nullptr in the + // latter case. + auto workspace_ptr = spmv_descr->workspace.usm_ptr; + auto functor_usm = [=](CusparseScopedContextHandler &sc) { + compute_functor(sc, workspace_ptr); + }; + return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle, + x_handle, y_handle); + } +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp new file mode 100644 index 000000000..2f124caad --- /dev/null +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -0,0 +1,263 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#include "oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp" + +#include "sparse_blas/backends/cusparse/cusparse_error.hpp" +#include "sparse_blas/backends/cusparse/cusparse_helper.hpp" +#include "sparse_blas/backends/cusparse/cusparse_task.hpp" +#include "sparse_blas/backends/cusparse/cusparse_handles.hpp" +#include "sparse_blas/common_op_verification.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" + +namespace oneapi::mkl::sparse { + +// Complete the definition of the incomplete type +struct spsv_descr { + cusparseSpSVDescr_t cu_descr; + detail::generic_container workspace; + bool buffer_size_called = false; + bool optimized_called = false; + oneapi::mkl::transpose last_optimized_opA; + oneapi::mkl::sparse::matrix_view last_optimized_A_view; + oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; + oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; + oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; + oneapi::mkl::sparse::spsv_alg last_optimized_alg; +}; + +} // namespace oneapi::mkl::sparse + +namespace oneapi::mkl::sparse::cusparse { + +void init_spsv_descr(sycl::queue & /*queue*/, spsv_descr_t *p_spsv_descr) { + *p_spsv_descr = new spsv_descr(); + CUSPARSE_ERR_FUNC(cusparseSpSV_createDescr, &(*p_spsv_descr)->cu_descr); +} + +sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, + const std::vector &dependencies) { + // Use dispatch_submit to ensure the backend's descriptor is kept alive as long as the buffers are used + auto functor = [=](CusparseScopedContextHandler &) { + CUSPARSE_ERR_FUNC(cusparseSpSV_destroyDescr, spsv_descr->cu_descr); + delete spsv_descr; + }; + return dispatch_submit(__func__, queue, dependencies, functor, + spsv_descr->last_optimized_A_handle, spsv_descr->last_optimized_x_handle, + 
spsv_descr->last_optimized_y_handle); +} + +inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { + return CUSPARSE_SPSV_ALG_DEFAULT; +} + +void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spsv_alg alg, + oneapi::mkl::sparse::spsv_descr_t spsv_descr, std::size_t &temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) { + auto cu_handle = sc.get_handle(queue); + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + set_matrix_attributes(__func__, cu_a, A_view); + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spsv_alg(alg); + auto cu_descr = spsv_descr->cu_descr; + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpSV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, + cu_alg, cu_descr, &temp_buffer_size); + check_status(status, __func__); + }; + auto event = dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + event.wait_and_throw(); + spsv_descr->buffer_size_called = true; +} + +inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spsv_alg alg, + 
oneapi::mkl::sparse::spsv_descr_t spsv_descr) { + detail::check_valid_spsv_common("spsv_optimize", A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + if (!spsv_descr->buffer_size_called) { + throw mkl::uninitialized("sparse_blas", "spsv_optimize", + "spsv_buffer_size must be called before spsv_optimize."); + } + spsv_descr->optimized_called = true; + spsv_descr->last_optimized_opA = opA; + spsv_descr->last_optimized_A_view = A_view; + spsv_descr->last_optimized_A_handle = A_handle; + spsv_descr->last_optimized_x_handle = x_handle; + spsv_descr->last_optimized_y_handle = y_handle; + spsv_descr->last_optimized_alg = alg; +} + +void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spsv_alg alg, + oneapi::mkl::sparse::spsv_descr_t spsv_descr, void *workspace_ptr, + bool is_alpha_host_accessible) { + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + set_matrix_attributes("optimize_spsv", cu_a, A_view); + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spsv_alg(alg); + auto cu_descr = spsv_descr->cu_descr; + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpSV_analysis(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, + cu_descr, workspace_ptr); + check_status(status, "optimize_spsv"); +} + +void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t 
y_handle, + oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + sycl::buffer workspace) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + if (!A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, + spsv_descr); + // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE + // Copy the buffer to extend its lifetime until the descriptor is free'd. + spsv_descr->workspace.set_buffer_untyped(workspace); + + if (workspace.size() > 0) { + auto functor = [=](CusparseScopedContextHandler &sc, + sycl::accessor workspace_acc) { + auto cu_handle = sc.get_handle(queue); + auto workspace_ptr = sc.get_mem(workspace_acc); + spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, + spsv_descr, workspace_ptr, is_alpha_host_accessible); + }; + + // The accessor can only be bound to the cgh if the buffer size is + // greater than 0 + sycl::accessor workspace_placeholder_acc(workspace); + dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, x_handle, + y_handle); + } + else { + auto functor = [=](CusparseScopedContextHandler &sc) { + auto cu_handle = sc.get_handle(queue); + spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, + spsv_descr, nullptr, is_alpha_host_accessible); + }; + + dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + } +} + +sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spsv_alg alg, + oneapi::mkl::sparse::spsv_descr_t spsv_descr, void *workspace, + const std::vector &dependencies) { 
+ bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + if (A_handle->all_use_buffer()) { + detail::throw_incompatible_container(__func__); + } + common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, + spsv_descr); + // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE + auto functor = [=](CusparseScopedContextHandler &sc) { + auto cu_handle = sc.get_handle(queue); + spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, + spsv_descr, workspace, is_alpha_host_accessible); + }; + // No need to store the workspace USM pointer as the backend stores it already + return dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle); +} + +sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + oneapi::mkl::sparse::matrix_view A_view, + oneapi::mkl::sparse::matrix_handle_t A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + const std::vector &dependencies) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { + detail::throw_incompatible_container(__func__); + } + + if (!spsv_descr->optimized_called) { + throw mkl::uninitialized("sparse_blas", __func__, + "spsv_optimize must be called before spsv."); + } + CHECK_DESCR_MATCH(spsv_descr, opA, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, A_view, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, A_handle, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, x_handle, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, y_handle, "spsv_optimize"); + CHECK_DESCR_MATCH(spsv_descr, alg, 
"spsv_optimize"); + + auto functor = [=](CusparseScopedContextHandler &sc) { + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + set_matrix_attributes(__func__, cu_a, A_view); + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spsv_alg(alg); + auto cu_descr = spsv_descr->cu_descr; + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpSV_solve(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, + cu_descr); + check_status(status, __func__); +#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); +#endif + }; + return dispatch_submit_native_ext(__func__, queue, dependencies, functor, A_handle, x_handle, + y_handle); +} + +} // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp new file mode 100644 index 000000000..28c628438 --- /dev/null +++ b/src/sparse_blas/backends/mkl_common/mkl_dispatch.hpp @@ -0,0 +1,37 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_ +#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_ + +/// Convert \p value_type to template type argument and use it to call \p op_functor. +#define DISPATCH_MKL_OPERATION(function_name, value_type, op_functor, ...) \ + switch (value_type) { \ + case detail::data_type::real_fp32: return op_functor(__VA_ARGS__); \ + case detail::data_type::real_fp64: return op_functor(__VA_ARGS__); \ + case detail::data_type::complex_fp32: return op_functor>(__VA_ARGS__); \ + case detail::data_type::complex_fp64: \ + return op_functor>(__VA_ARGS__); \ + default: \ + throw oneapi::mkl::exception( \ + "sparse_blas", function_name, \ + "Internal error: unsupported type " + data_type_to_str(value_type)); \ + } + +#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_DISPATCH_HPP_ diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx index 3ae84ca64..7550625eb 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx @@ -32,27 +32,11 @@ void init_dense_vector(sycl::queue & /*queue*/, *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size); } -template -void check_can_reset_value_handle(const std::string &function_name, - InternalHandleT *internal_handle, bool expect_buffer) { - if (internal_handle->get_value_type() != detail::get_data_type()) { - throw oneapi::mkl::invalid_argument( - "sparse_blas", function_name, - "Incompatible data types expected " + - data_type_to_str(internal_handle->get_value_type()) + " but got " + - data_type_to_str(detail::get_data_type())); - } - if (internal_handle->all_use_buffer() != expect_buffer) { - throw oneapi::mkl::invalid_argument( - 
"sparse_blas", function_name, "Cannot change the container type between buffer or USM"); - } -} - template void set_dense_vector_data(sycl::queue & /*queue*/, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size, sycl::buffer val) { - check_can_reset_value_handle(__func__, dvhandle, true); + detail::check_can_reset_value_handle(__func__, dvhandle, true); dvhandle->size = size; dvhandle->set_buffer(val); } @@ -61,26 +45,12 @@ template void set_dense_vector_data(sycl::queue & /*queue*/, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size, fpType *val) { - check_can_reset_value_handle(__func__, dvhandle, false); + detail::check_can_reset_value_handle(__func__, dvhandle, false); dvhandle->size = size; dvhandle->set_usm_ptr(val); } -#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX) \ - template void init_dense_vector( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ - std::int64_t size, sycl::buffer val); \ - template void init_dense_vector( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ - std::int64_t size, FP_TYPE * val); \ - template void set_dense_vector_data( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ - std::int64_t size, sycl::buffer val); \ - template void set_dense_vector_data( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ - std::int64_t size, FP_TYPE * val) FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); -#undef INSTANTIATE_DENSE_VECTOR_FUNCS sycl::event release_dense_vector(sycl::queue &queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, @@ -112,7 +82,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, sycl::buffer val) { - check_can_reset_value_handle(__func__, dmhandle, true); + 
detail::check_can_reset_value_handle(__func__, dmhandle, true); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; dmhandle->ld = ld; @@ -125,7 +95,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, fpType *val) { - check_can_reset_value_handle(__func__, dmhandle, false); + detail::check_can_reset_value_handle(__func__, dmhandle, false); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; dmhandle->ld = ld; @@ -133,25 +103,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/, dmhandle->set_usm_ptr(val); } -#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX) \ - template void init_dense_matrix( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, sycl::buffer val); \ - template void init_dense_matrix( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE * val); \ - template void set_dense_matrix_data( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, sycl::buffer val); \ - template void set_dense_matrix_data( \ - sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE * val) FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); -#undef INSTANTIATE_DENSE_MATRIX_FUNCS sycl::event release_dense_matrix(sycl::queue &queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, @@ -167,7 +119,8 @@ void init_coo_matrix(sycl::queue &queue, 
oneapi::mkl::sparse::matrix_handle_t *p sycl::buffer col_ind, sycl::buffer val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val, + num_rows, num_cols, nnz, index); // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. oneapi::mkl::sparse::set_coo_data(queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), static_cast(nnz), @@ -184,7 +137,8 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p fpType *val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val, + num_rows, num_cols, nnz, index); auto event = oneapi::mkl::sparse::set_coo_data( queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), static_cast(nnz), index, row_ind, col_ind, val); @@ -192,32 +146,17 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p *p_smhandle = reinterpret_cast(internal_smhandle); } -template -void check_can_reset_sparse_handle(const std::string &function_name, - detail::sparse_matrix_handle *internal_smhandle, - bool expect_buffer) { - check_can_reset_value_handle(function_name, internal_smhandle, expect_buffer); - if (internal_smhandle->get_int_type() != detail::get_data_type()) { - throw oneapi::mkl::invalid_argument( - "sparse_blas", function_name, - "Incompatible data types expected " + - data_type_to_str(internal_smhandle->get_int_type()) + " but got " + - data_type_to_str(detail::get_data_type())); - } - if (!internal_smhandle->can_be_reset) { - 
throw mkl::unimplemented( - "sparse_blas/mkl", function_name, - "Reseting the matrix handle's data after it was used in a computation is not supported."); - } -} - template void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle(__func__, internal_smhandle, true); + detail::check_can_reset_sparse_handle(__func__, internal_smhandle, true); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_buffer(row_ind); internal_smhandle->col_container.set_buffer(col_ind); internal_smhandle->value_container.set_buffer(val); @@ -236,7 +175,11 @@ void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_ oneapi::mkl::index_base index, intType *row_ind, intType *col_ind, fpType *val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle(__func__, internal_smhandle, false); + detail::check_can_reset_sparse_handle(__func__, internal_smhandle, false); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_usm_ptr(row_ind); internal_smhandle->col_container.set_usm_ptr(col_ind); internal_smhandle->value_container.set_usm_ptr(val); @@ -246,37 +189,18 @@ void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_ event.wait_and_throw(); } -#define INSTANTIATE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ - template void init_coo_matrix( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - std::int64_t num_rows, 
std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, sycl::buffer row_ind, \ - sycl::buffer col_ind, sycl::buffer val); \ - template void init_coo_matrix( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val); \ - template void set_coo_matrix_data( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - sycl::buffer row_ind, sycl::buffer col_ind, \ - sycl::buffer val); \ - template void set_coo_matrix_data( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val) FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS); -#undef INSTANTIATE_COO_MATRIX_FUNCS // CSR matrix template void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, + num_rows, num_cols, nnz, index); // The backend deduces nnz from row_ptr. // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. 
oneapi::mkl::sparse::set_csr_data(queue, mkl_handle, static_cast(num_rows), @@ -289,12 +213,13 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p template void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind, fpType *val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, + num_rows, num_cols, nnz, index); // The backend deduces nnz from row_ptr. auto event = oneapi::mkl::sparse::set_csr_data( queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), index, @@ -305,11 +230,15 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p template void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle(__func__, internal_smhandle, true); + detail::check_can_reset_sparse_handle(__func__, internal_smhandle, true); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_buffer(row_ptr); internal_smhandle->col_container.set_buffer(col_ind); internal_smhandle->value_container.set_buffer(val); @@ -325,11 +254,15 @@ void 
set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_ template void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t /*nnz*/, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind, fpType *val) { auto internal_smhandle = detail::get_internal_handle(smhandle); - check_can_reset_sparse_handle(__func__, internal_smhandle, false); + detail::check_can_reset_sparse_handle(__func__, internal_smhandle, false); + internal_smhandle->num_rows = num_rows; + internal_smhandle->num_cols = num_cols; + internal_smhandle->nnz = nnz; + internal_smhandle->index = index; internal_smhandle->row_container.set_usm_ptr(row_ptr); internal_smhandle->col_container.set_usm_ptr(col_ind); internal_smhandle->value_container.set_usm_ptr(val); @@ -340,27 +273,7 @@ void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_ event.wait_and_throw(); } -#define INSTANTIATE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ - template void init_csr_matrix( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, sycl::buffer row_ptr, \ - sycl::buffer col_ind, sycl::buffer val); \ - template void init_csr_matrix( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ - std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val); \ - template void set_csr_matrix_data( \ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - sycl::buffer row_ptr, sycl::buffer col_ind, \ - sycl::buffer val); \ - template void set_csr_matrix_data( 
\ - sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val) FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); -#undef INSTANTIATE_CSR_MATRIX_FUNCS // Common sparse matrix functions sycl::event release_sparse_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, @@ -369,7 +282,7 @@ sycl::event release_sparse_matrix(sycl::queue &queue, oneapi::mkl::sparse::matri // Asynchronously release the backend's handle followed by the internal handle. auto event = oneapi::mkl::sparse::release_matrix_handle( queue, &internal_smhandle->backend_handle, dependencies); - return detail::submit_release(queue, internal_smhandle, event); + return detail::submit_release(queue, internal_smhandle, { event }); } bool set_matrix_property(sycl::queue & /*queue*/, oneapi::mkl::sparse::matrix_handle_t smhandle, diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp index efadd72e7..24a61ce5e 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_handles.hpp +++ b/src/sparse_blas/backends/mkl_common/mkl_handles.hpp @@ -26,6 +26,8 @@ #include #include "sparse_blas/generic_container.hpp" +#include "sparse_blas/macros.hpp" +#include "sparse_blas/sycl_helper.hpp" namespace oneapi::mkl::sparse { diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx index dad611252..acde45cb4 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx @@ -50,35 +50,9 @@ void check_valid_spmm(const std::string &function_name, oneapi::mkl::transpose o oneapi::mkl::sparse::dense_matrix_handle_t B_handle, oneapi::mkl::sparse::dense_matrix_handle_t C_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { - THROW_IF_NULLPTR(function_name, 
A_handle); - THROW_IF_NULLPTR(function_name, B_handle); - THROW_IF_NULLPTR(function_name, C_handle); - auto internal_A_handle = detail::get_internal_handle(A_handle); - detail::check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle); - if (internal_A_handle->all_use_buffer()) { - detail::check_ptr_is_host_accessible("spmm", "alpha", is_alpha_host_accessible); - detail::check_ptr_is_host_accessible("spmm", "beta", is_beta_host_accessible); - } - if (is_alpha_host_accessible != is_beta_host_accessible) { - throw mkl::invalid_argument( - "sparse_blas", function_name, - "Alpha and beta must both be placed on host memory or device memory."); - } - if (B_handle->dense_layout != C_handle->dense_layout) { - throw mkl::invalid_argument("sparse_blas", function_name, - "B and C matrices must used the same layout."); - } - - if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type must be `matrix_descr::general`."); - } - - if (A_view.diag_view != oneapi::mkl::diag::nonunit) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix's diag_view must be `nonunit`."); - } + detail::check_valid_spmm_common(function_name, A_view, internal_A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); #if BACKEND == gpu detail::data_type data_type = internal_A_handle->get_value_type(); diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx index d2332286b..cba197848 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx @@ -49,32 +49,9 @@ void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose o oneapi::mkl::sparse::dense_vector_handle_t x_handle, oneapi::mkl::sparse::dense_vector_handle_t y_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { - 
THROW_IF_NULLPTR(function_name, A_handle); - THROW_IF_NULLPTR(function_name, x_handle); - THROW_IF_NULLPTR(function_name, y_handle); - auto internal_A_handle = detail::get_internal_handle(A_handle); - detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle); - if (internal_A_handle->all_use_buffer()) { - detail::check_ptr_is_host_accessible("spmv", "alpha", is_alpha_host_accessible); - detail::check_ptr_is_host_accessible("spmv", "beta", is_beta_host_accessible); - } - if (is_alpha_host_accessible != is_beta_host_accessible) { - throw mkl::invalid_argument( - "sparse_blas", function_name, - "Alpha and beta must both be placed on host memory or device memory."); - } - if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type cannot be diagonal."); - } - - if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular && - A_view.diag_view == oneapi::mkl::diag::unit) { - throw mkl::invalid_argument( - "sparse_blas", function_name, - "`unit` diag_view can only be used with a triangular type_view."); - } + detail::check_valid_spmv_common(__func__, opA, A_view, internal_A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); if ((A_view.type_view == oneapi::mkl::sparse::matrix_descr::symmetric || A_view.type_view == oneapi::mkl::sparse::matrix_descr::hermitian) && diff --git a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx index 7ef5b3c39..01575ac36 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx @@ -49,11 +49,10 @@ void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose o oneapi::mkl::sparse::dense_vector_handle_t x_handle, oneapi::mkl::sparse::dense_vector_handle_t y_handle, bool is_alpha_host_accessible, oneapi::mkl::sparse::spsv_alg alg) { - 
THROW_IF_NULLPTR(function_name, A_handle); - THROW_IF_NULLPTR(function_name, x_handle); - THROW_IF_NULLPTR(function_name, y_handle); - auto internal_A_handle = detail::get_internal_handle(A_handle); + detail::check_valid_spsv_common(function_name, A_view, internal_A_handle, x_handle, y_handle, + is_alpha_host_accessible); + if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg && !internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::sorted)) { throw mkl::unimplemented( @@ -72,16 +71,6 @@ void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose o #else (void)opA; #endif // BACKEND - - detail::check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle); - if (A_view.type_view != matrix_descr::triangular) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type must be `matrix_descr::triangular`."); - } - - if (internal_A_handle->all_use_buffer()) { - detail::check_ptr_is_host_accessible("spsv", "alpha", is_alpha_host_accessible); - } } void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp index a6ea51629..0aaf91b25 100644 --- a/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp +++ b/src/sparse_blas/backends/mklcpu/mklcpu_handles.cpp @@ -19,7 +19,7 @@ #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" namespace oneapi::mkl::sparse::mklcpu { diff --git a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp index 0929a7ef4..ebc8ceecf 100644 --- a/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp +++ b/src/sparse_blas/backends/mklcpu/mklcpu_operations.cpp @@ 
-17,10 +17,12 @@ * **************************************************************************/ +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" +#include "sparse_blas/common_op_verification.hpp" #include "sparse_blas/macros.hpp" #include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" #include "oneapi/mkl/sparse_blas/detail/mklcpu/onemkl_sparse_blas_mklcpu.hpp" diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp index 7cb9853a7..648fed66e 100644 --- a/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp +++ b/src/sparse_blas/backends/mklgpu/mklgpu_handles.cpp @@ -19,8 +19,8 @@ #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp" +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" namespace oneapi::mkl::sparse::mklgpu { diff --git a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp index be5e0c0aa..1102306dc 100644 --- a/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp +++ b/src/sparse_blas/backends/mklgpu/mklgpu_operations.cpp @@ -17,10 +17,12 @@ * **************************************************************************/ +#include "sparse_blas/backends/mkl_common/mkl_dispatch.hpp" #include "sparse_blas/backends/mkl_common/mkl_handles.hpp" -#include "sparse_blas/backends/mkl_common/mkl_helper.hpp" +#include "sparse_blas/common_op_verification.hpp" #include "sparse_blas/macros.hpp" #include "sparse_blas/matrix_view_comparison.hpp" +#include "sparse_blas/sycl_helper.hpp" #include "oneapi/mkl/sparse_blas/detail/mklgpu/onemkl_sparse_blas_mklgpu.hpp" diff --git a/src/sparse_blas/common_op_verification.hpp 
b/src/sparse_blas/common_op_verification.hpp new file mode 100644 index 000000000..e496c725e --- /dev/null +++ b/src/sparse_blas/common_op_verification.hpp @@ -0,0 +1,142 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +**************************************************************************/ + +#ifndef _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_ +#define _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_ + +#include + +#if __has_include() +#include +#else +#include +#endif + +#include "oneapi/mkl/sparse_blas/types.hpp" +#include "macros.hpp" + +namespace oneapi::mkl::sparse::detail { + +/// Throw an exception if the scalar is not accessible in the host +inline void check_ptr_is_host_accessible(const std::string &function_name, + const std::string &scalar_name, + bool is_ptr_accessible_on_host) { + if (!is_ptr_accessible_on_host) { + throw mkl::invalid_argument( + "sparse_blas", function_name, + "Scalar " + scalar_name + " must be accessible on the host for buffer functions."); + } +} + +template +void check_valid_spmm_common(const std::string &function_name, + oneapi::mkl::sparse::matrix_view A_view, + InternalSparseMatHandleT internal_A_handle, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, + oneapi::mkl::sparse::dense_matrix_handle_t 
C_handle, + bool is_alpha_host_accessible, bool is_beta_host_accessible) { + THROW_IF_NULLPTR(function_name, internal_A_handle); + THROW_IF_NULLPTR(function_name, B_handle); + THROW_IF_NULLPTR(function_name, C_handle); + + check_all_containers_compatible(function_name, internal_A_handle, B_handle, C_handle); + if (internal_A_handle->all_use_buffer()) { + check_ptr_is_host_accessible("spmm", "alpha", is_alpha_host_accessible); + check_ptr_is_host_accessible("spmm", "beta", is_beta_host_accessible); + } + if (is_alpha_host_accessible != is_beta_host_accessible) { + throw mkl::invalid_argument( + "sparse_blas", function_name, + "Alpha and beta must both be placed on host memory or device memory."); + } + if (B_handle->dense_layout != C_handle->dense_layout) { + throw mkl::invalid_argument("sparse_blas", function_name, + "B and C matrices must used the same layout."); + } + + if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) { + throw mkl::invalid_argument("sparse_blas", function_name, + "Matrix view's type must be `matrix_descr::general`."); + } + + if (A_view.diag_view != oneapi::mkl::diag::nonunit) { + throw mkl::invalid_argument("sparse_blas", function_name, + "Matrix's diag_view must be `nonunit`."); + } +} + +template +void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::transpose /*opA*/, + oneapi::mkl::sparse::matrix_view A_view, + InternalSparseMatHandleT internal_A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + bool is_alpha_host_accessible, bool is_beta_host_accessible) { + THROW_IF_NULLPTR(function_name, internal_A_handle); + THROW_IF_NULLPTR(function_name, x_handle); + THROW_IF_NULLPTR(function_name, y_handle); + + check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle); + if (internal_A_handle->all_use_buffer()) { + check_ptr_is_host_accessible("spmv", "alpha", is_alpha_host_accessible); + 
check_ptr_is_host_accessible("spmv", "beta", is_beta_host_accessible); + } + if (is_alpha_host_accessible != is_beta_host_accessible) { + throw mkl::invalid_argument( + "sparse_blas", function_name, + "Alpha and beta must both be placed on host memory or device memory."); + } + if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) { + throw mkl::invalid_argument("sparse_blas", function_name, + "Matrix view's type cannot be diagonal."); + } + + if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular && + A_view.diag_view == oneapi::mkl::diag::unit) { + throw mkl::invalid_argument( + "sparse_blas", function_name, + "`unit` diag_view can only be used with a triangular type_view."); + } +} + +template +void check_valid_spsv_common(const std::string &function_name, + oneapi::mkl::sparse::matrix_view A_view, + InternalSparseMatHandleT internal_A_handle, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, + oneapi::mkl::sparse::dense_vector_handle_t y_handle, + bool is_alpha_host_accessible) { + THROW_IF_NULLPTR(function_name, internal_A_handle); + THROW_IF_NULLPTR(function_name, x_handle); + THROW_IF_NULLPTR(function_name, y_handle); + + check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle); + if (A_view.type_view != matrix_descr::triangular) { + throw mkl::invalid_argument("sparse_blas", function_name, + "Matrix view's type must be `matrix_descr::triangular`."); + } + + if (internal_A_handle->all_use_buffer()) { + check_ptr_is_host_accessible("spsv", "alpha", is_alpha_host_accessible); + } +} + +} // namespace oneapi::mkl::sparse::detail + +#endif // _ONEMKL_SRC_SPARSE_BLAS_COMMON_OP_VERIFICATION_HPP_ \ No newline at end of file diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp index 53bd50837..5fe2b1ab2 100644 --- a/src/sparse_blas/generic_container.hpp +++ b/src/sparse_blas/generic_container.hpp @@ -61,6 +61,10 @@ struct generic_container { 
buffer_ptr(std::make_shared>(buffer)), data_type(get_data_type()) {} + bool use_buffer() const { + return static_cast(buffer_ptr); + } + template void set_usm_ptr(T* ptr) { usm_ptr = ptr; @@ -108,7 +112,7 @@ struct generic_dense_handle { value_container(value_buffer) {} bool all_use_buffer() const { - return static_cast(value_container.buffer_ptr); + return value_container.use_buffer(); } data_type get_value_type() const { @@ -210,34 +214,47 @@ struct generic_sparse_handle { generic_container col_container; generic_container value_container; + std::int64_t num_rows; + std::int64_t num_cols; + std::int64_t nnz; + oneapi::mkl::index_base index; std::int32_t properties_mask; bool can_be_reset; template generic_sparse_handle(BackendHandleT backend_handle, intType* row_ptr, intType* col_ptr, - fpType* value_ptr) + fpType* value_ptr, std::int64_t num_rows, std::int64_t num_cols, + std::int64_t nnz, oneapi::mkl::index_base index) : backend_handle(backend_handle), row_container(generic_container(row_ptr)), col_container(generic_container(col_ptr)), value_container(generic_container(value_ptr)), + num_rows(num_rows), + num_cols(num_cols), + nnz(nnz), + index(index), properties_mask(0), can_be_reset(true) {} template generic_sparse_handle(BackendHandleT backend_handle, const sycl::buffer row_buffer, const sycl::buffer col_buffer, - const sycl::buffer value_buffer) + const sycl::buffer value_buffer, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index) : backend_handle(backend_handle), row_container(row_buffer), col_container(col_buffer), value_container(value_buffer), + num_rows(num_rows), + num_cols(num_cols), + nnz(nnz), + index(index), properties_mask(0), can_be_reset(true) {} bool all_use_buffer() const { - return static_cast(value_container.buffer_ptr) && - static_cast(row_container.buffer_ptr) && - static_cast(col_container.buffer_ptr); + return value_container.use_buffer() && row_container.use_buffer() && + 
col_container.use_buffer(); } data_type get_value_type() const { @@ -321,12 +338,38 @@ void check_all_containers_compatible(const std::string& function_name, } } -template -sycl::event submit_release(sycl::queue& queue, T* ptr, const DependenciesT& dependencies) { - return queue.submit([&](sycl::handler& cgh) { - cgh.depends_on(dependencies); - cgh.host_task([=]() { delete ptr; }); - }); +template +void check_can_reset_value_handle(const std::string& function_name, + InternalHandleT* internal_handle, bool expect_buffer) { + if (internal_handle->get_value_type() != detail::get_data_type()) { + throw oneapi::mkl::invalid_argument( + "sparse_blas", function_name, + "Incompatible data types expected " + + data_type_to_str(internal_handle->get_value_type()) + " but got " + + data_type_to_str(detail::get_data_type())); + } + if (internal_handle->all_use_buffer() != expect_buffer) { + throw oneapi::mkl::invalid_argument( + "sparse_blas", function_name, "Cannot change the container type between buffer or USM"); + } +} + +template +void check_can_reset_sparse_handle(const std::string& function_name, + InternalHandleT* internal_smhandle, bool expect_buffer) { + check_can_reset_value_handle(function_name, internal_smhandle, expect_buffer); + if (internal_smhandle->get_int_type() != detail::get_data_type()) { + throw oneapi::mkl::invalid_argument( + "sparse_blas", function_name, + "Incompatible data types expected " + + data_type_to_str(internal_smhandle->get_int_type()) + " but got " + + data_type_to_str(detail::get_data_type())); + } + if (!internal_smhandle->can_be_reset) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support reseting the matrix handle's data after it was used in a computation."); + } } } // namespace oneapi::mkl::sparse::detail diff --git a/src/sparse_blas/macros.hpp b/src/sparse_blas/macros.hpp index 7eba01390..9eb769736 100644 --- a/src/sparse_blas/macros.hpp +++ b/src/sparse_blas/macros.hpp @@ -36,10 +36,91 @@ 
FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int32_t, _i32); \ FOR_EACH_FP_AND_INT_TYPE_HELPER(DEFINE_MACRO, std::int64_t, _i64) +#define INSTANTIATE_DENSE_VECTOR_FUNCS(FP_TYPE, FP_SUFFIX) \ + template void init_dense_vector( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ + std::int64_t size, sycl::buffer val); \ + template void init_dense_vector( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ + std::int64_t size, FP_TYPE * val); \ + template void set_dense_vector_data( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ + std::int64_t size, sycl::buffer val); \ + template void set_dense_vector_data( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ + std::int64_t size, FP_TYPE * val) + +#define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX) \ + template void init_dense_matrix( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, sycl::buffer val); \ + template void init_dense_matrix( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, FP_TYPE * val); \ + template void set_dense_matrix_data( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, sycl::buffer val); \ + template void set_dense_matrix_data( \ + sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ + oneapi::mkl::layout dense_layout, FP_TYPE * val) + +#define INSTANTIATE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ + template void init_coo_matrix( \ + sycl::queue & 
queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, sycl::buffer row_ind, \ + sycl::buffer col_ind, sycl::buffer val); \ + template void init_coo_matrix( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val); \ + template void set_coo_matrix_data( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ + sycl::buffer row_ind, sycl::buffer col_ind, \ + sycl::buffer val); \ + template void set_coo_matrix_data( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ + INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val) + +#define INSTANTIATE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ + template void init_csr_matrix( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, sycl::buffer row_ptr, \ + sycl::buffer col_ind, sycl::buffer val); \ + template void init_csr_matrix( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ + oneapi::mkl::index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val); \ + template void set_csr_matrix_data( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ + sycl::buffer row_ptr, sycl::buffer col_ind, \ + sycl::buffer val); \ + template void 
set_csr_matrix_data( \ + sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ + INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val) + #define THROW_IF_NULLPTR(FUNC_NAME, PTR) \ if (!(PTR)) { \ throw mkl::uninitialized("sparse_blas", FUNC_NAME, \ std::string(#PTR) + " must not be nullptr."); \ } +#define CHECK_DESCR_MATCH(descr, argument, optimize_func_name) \ + do { \ + if (descr->last_optimized_##argument != argument) { \ + throw mkl::invalid_argument( \ + "sparse_blas", __func__, \ + #argument " argument must match with the previous call to " #optimize_func_name); \ + } \ + } while (0) + #endif // _ONEMKL_SPARSE_BLAS_MACROS_HPP_ diff --git a/src/sparse_blas/sycl_helper.hpp b/src/sparse_blas/sycl_helper.hpp new file mode 100644 index 000000000..67580159c --- /dev/null +++ b/src/sparse_blas/sycl_helper.hpp @@ -0,0 +1,80 @@ +/*************************************************************************** +* Copyright (C) Codeplay Software Limited +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* For your convenience, a copy of the License has been included in this +* repository. +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +**************************************************************************/ + +#ifndef _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_ +#define _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_ + +#if __has_include() +#include +#else +#include +#endif + +namespace oneapi::mkl::sparse::detail { + +/// Return whether a pointer is accessible on the host +template +inline bool is_ptr_accessible_on_host(sycl::queue queue, const T *host_or_device_ptr) { + auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context()); + return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared || + alloc_type == sycl::usm::alloc::unknown; +} + +/// Return a scalar on the host from a pointer to host or device memory +template +inline T get_scalar_on_host(sycl::queue &queue, const T *host_or_device_ptr, + bool is_ptr_accessible_on_host) { + if (is_ptr_accessible_on_host) { + return *host_or_device_ptr; + } + T scalar; + auto event = queue.copy(host_or_device_ptr, &scalar, 1); + event.wait_and_throw(); + return scalar; +} + +/// Submit the release of \p ptr in a host_task waiting on the dependencies +template +sycl::event submit_release(sycl::queue &queue, T *ptr, + const std::vector &dependencies) { + return queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependencies); + cgh.host_task([=]() { delete ptr; }); + }); +} + +/// Merge multiple event dependencies into one +inline sycl::event collapse_dependencies(sycl::queue &queue, + const std::vector &dependencies) { + if (dependencies.empty()) { + return {}; + } + else if (dependencies.size() == 1) { + return dependencies[0]; + } + + return queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependencies); + cgh.host_task([=]() {}); + }); +} + +} // namespace oneapi::mkl::sparse::detail + +#endif // _ONEMKL_SRC_SPARSE_BLAS_SYCL_HELPER_HPP_ diff --git a/tests/unit_tests/CMakeLists.txt b/tests/unit_tests/CMakeLists.txt index e7fe8e110..5fc56d04a 100644 --- a/tests/unit_tests/CMakeLists.txt +++ 
b/tests/unit_tests/CMakeLists.txt @@ -178,6 +178,11 @@ foreach(domain ${TARGET_DOMAINS}) list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_dft_portfft) endif() + if(domain STREQUAL "sparse_blas" AND ENABLE_CUSPARSE_BACKEND) + add_dependencies(test_main_${domain}_ct onemkl_${domain}_cusparse) + list(APPEND ONEMKL_LIBRARIES_${domain} onemkl_${domain}_cusparse) + endif() + target_link_libraries(test_main_${domain}_ct PUBLIC gtest gtest_main diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp index ad215761f..5457079e0 100644 --- a/tests/unit_tests/include/test_helper.hpp +++ b/tests/unit_tests/include/test_helper.hpp @@ -176,6 +176,13 @@ #define TEST_RUN_PORTFFT_SELECT(q, func, ...) #endif +#ifdef ENABLE_CUSPARSE_BACKEND +#define TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, ...) \ + func(oneapi::mkl::backend_selector{ q }, __VA_ARGS__) +#else +#define TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, ...) +#endif + #ifndef __HIPSYCL__ #define CHECK_HOST_OR_CPU(q) q.get_device().is_cpu() #else @@ -268,6 +275,9 @@ if (vendor_id == INTEL_ID) { \ TEST_RUN_INTELGPU_SELECT(q, func, __VA_ARGS__); \ } \ + else if (vendor_id == NVIDIA_ID) { \ + TEST_RUN_NVIDIAGPU_CUSPARSE_SELECT(q, func, __VA_ARGS__); \ + } \ } \ } while (0); diff --git a/tests/unit_tests/main_test.cpp b/tests/unit_tests/main_test.cpp index bac3f8c83..fc208da09 100644 --- a/tests/unit_tests/main_test.cpp +++ b/tests/unit_tests/main_test.cpp @@ -122,7 +122,8 @@ int main(int argc, char** argv) { #endif #if !defined(ENABLE_CUBLAS_BACKEND) && !defined(ENABLE_CURAND_BACKEND) && \ !defined(ENABLE_CUSOLVER_BACKEND) && !defined(ENABLE_PORTBLAS_BACKEND_NVIDIA_GPU) && \ - !defined(ENABLE_CUFFT_BACKEND) && !defined(ENABLE_PORTFFT_BACKEND) + !defined(ENABLE_CUFFT_BACKEND) && !defined(ENABLE_PORTFFT_BACKEND) && \ + !defined(ENABLE_CUSPARSE_BACKEND) if (dev.is_gpu() && vendor_id == NVIDIA_ID) continue; #endif diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp 
b/tests/unit_tests/sparse_blas/include/test_common.hpp index c11255a9a..a02f91789 100644 --- a/tests/unit_tests/sparse_blas/include/test_common.hpp +++ b/tests/unit_tests/sparse_blas/include/test_common.hpp @@ -332,13 +332,18 @@ intType generate_random_matrix(sparse_matrix_format_t format, const intType nrow throw std::runtime_error("Unsupported sparse format"); } +inline bool require_coo_sorted_by_row(sycl::queue queue) { + auto vendor_id = oneapi::mkl::get_device_id(queue); + return vendor_id == oneapi::mkl::device::nvidiagpu; +} + /// Shuffle the 3arrays CSR or COO representation (ia, ja, values) /// of any sparse matrix. /// In CSR format, the elements within a row are shuffled without changing ia. /// In COO format, all the elements are shuffled. template -void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intType *ia, - intType *ja, fpType *a, intType nnz, std::size_t nrows) { +void shuffle_sparse_matrix(sycl::queue queue, sparse_matrix_format_t format, intType indexing, + intType *ia, intType *ja, fpType *a, intType nnz, std::size_t nrows) { if (format == sparse_matrix_format_t::CSR) { for (std::size_t i = 0; i < nrows; ++i) { intType nnz_row = ia[i + 1] - ia[i]; @@ -351,12 +356,33 @@ void shuffle_sparse_matrix(sparse_matrix_format_t format, intType indexing, intT } } else if (format == sparse_matrix_format_t::COO) { - for (std::size_t i = 0; i < static_cast(nnz); ++i) { - intType q = std::rand() % nnz; - // Swap elements i and q - std::swap(ia[q], ia[i]); - std::swap(ja[q], ja[i]); - std::swap(a[q], a[i]); + if (require_coo_sorted_by_row(queue)) { + std::size_t linear_idx = 0; + for (std::size_t i = 0; i < nrows; ++i) { + // Count the number of non-zero elements for the given row + std::size_t nnz_row = 1; + while (linear_idx + nnz_row < static_cast(nnz) && + ia[linear_idx] == ia[linear_idx + nnz_row]) { + ++nnz_row; + } + for (std::size_t j = 0; j < nnz_row; ++j) { + // Swap elements within the same row + std::size_t q = linear_idx 
+ (static_cast(std::rand()) % nnz_row); + // Swap elements j and q + std::swap(ja[q], ja[linear_idx + j]); + std::swap(a[q], a[linear_idx + j]); + } + linear_idx += nnz_row; + } + } + else { + for (std::size_t i = 0; i < static_cast(nnz); ++i) { + intType q = std::rand() % nnz; + // Swap elements i and q + std::swap(ia[q], ia[i]); + std::swap(ja[q], ja[i]); + std::swap(a[q], a[i]); + } } } else { diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp index b6f9e1185..df6fb850b 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp @@ -74,8 +74,8 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, // Shuffle ordering of column indices/values to test sortedness if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast(nrows_A)); + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), + a_host.data(), nnz, static_cast(nrows_A)); } auto ia_buf = make_buffer(ia_host); @@ -120,7 +120,7 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, static_cast(nrows_A)); } if (reset_nnz > nnz) { diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp index 5778430a6..7d30426c4 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp @@ -70,8 +70,8 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, 
// Shuffle ordering of column indices/values to test sortedness if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast(nrows_A)); + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), + a_host.data(), nnz, static_cast(nrows_A)); } auto ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); @@ -153,7 +153,7 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, static_cast(nrows_A)); } if (reset_nnz > nnz) { diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp index 3d99f9e94..e03c09ebe 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp @@ -67,8 +67,8 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, // Shuffle ordering of column indices/values to test sortedness if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast(nrows_A)); + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), + a_host.data(), nnz, static_cast(nrows_A)); } auto ia_buf = make_buffer(ia_host); @@ -110,7 +110,7 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), 
a_host.data(), reset_nnz, static_cast(nrows_A)); } if (reset_nnz > nnz) { diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp index ded92a770..eb54f6a5d 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp @@ -63,8 +63,8 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, // Shuffle ordering of column indices/values to test sortedness if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - static_cast(nrows_A)); + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), + a_host.data(), nnz, static_cast(nrows_A)); } auto ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); @@ -145,7 +145,7 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, static_cast(nrows_A)); } if (reset_nnz > nnz) { diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp index 6b276dff4..b64219b9a 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp @@ -70,8 +70,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl // Shuffle ordering of column indices/values to test sortedness if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - mu); + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), + a_host.data(), nnz, mu); 
} auto ia_buf = make_buffer(ia_host); @@ -110,7 +110,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric, require_diagonal); if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, mu); } if (reset_nnz > nnz) { @@ -170,8 +170,11 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl y_ref_host.data()); // Compare the results of reference implementation and DPC++ implementation. + // Increase default relative error margin for tests that lead to large numeric values. + double abs_error_factor = 10; + double rel_error_factor = 1E5; auto y_acc = y_buf.get_host_access(sycl::read_only); - bool valid = check_equal_vector(y_acc, y_ref_host); + bool valid = check_equal_vector(y_acc, y_ref_host, abs_error_factor, rel_error_factor); return static_cast(valid); } diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index 3b58db914..be427d011 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -66,8 +66,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl // Shuffle ordering of column indices/values to test sortedness if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), a_host.data(), nnz, - mu); + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), + a_host.data(), nnz, mu); } auto ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); @@ -141,7 +141,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric, 
require_diagonal); if (!is_sorted) { - shuffle_sparse_matrix(format, indexing, ia_host.data(), ja_host.data(), + shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, mu); } if (reset_nnz > nnz) { @@ -218,8 +218,11 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl y_ref_host.data()); // Compare the results of reference implementation and DPC++ implementation. + // Increase default relative error margin for tests that lead to large numeric values. + double abs_error_factor = 10; + double rel_error_factor = 1E5; ev_copy.wait_and_throw(); - bool valid = check_equal_vector(y_host, y_ref_host); + bool valid = check_equal_vector(y_host, y_ref_host, abs_error_factor, rel_error_factor); return static_cast(valid); } From 3d02b34206d83f36c18e3532f5b492c73ab68c91 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 10 Sep 2024 17:21:01 +0200 Subject: [PATCH 02/43] Remove previous compile time example --- .../sparse_blas_spmv_usm_mklcpu.cpp | 285 ------------------ 1 file changed, 285 deletions(-) delete mode 100644 examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp deleted file mode 100644 index 4ab078601..000000000 --- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu.cpp +++ /dev/null @@ -1,285 +0,0 @@ -/******************************************************************************* -* Copyright 2023 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, -* software distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions -* and limitations under the License. -* -* -* SPDX-License-Identifier: Apache-2.0 -*******************************************************************************/ - -/* -* -* Content: -* This example demonstrates use of DPCPP API oneapi::mkl::sparse::spmv -* using unified shared memory to perform general sparse matrix-vector -* multiplication on a INTEL CPU SYCL device. -* -* y = alpha * op(A) * x + beta * y -* -* where op() is defined by one of -* -* oneapi::mkl::transpose::{nontrans,trans,conjtrans} -* -* -* This example demonstrates only single precision (float) data type for -* spmv matrix data -* -* -*******************************************************************************/ - -// stl includes -#include -#include - -#if __has_include() -#include -#else -#include -#endif -#include "oneapi/mkl.hpp" - -#include "example_helper.hpp" - -// -// Main example for Sparse Matrix-Vector Multiply consisting of -// initialization of A matrix, x and y vectors as well as -// scalars alpha and beta. Then the product -// -// y = alpha * op(A) * x + beta * y -// -// is performed and finally the results are post processed. 
-// -template -int run_sparse_matrix_vector_multiply_example(const sycl::device &cpu_dev) { - // Matrix data size - intType size = 4; - intType nrows = size * size * size; - - // Set scalar fp values - fp alpha = set_fp_value(fp(1.0)); - fp beta = set_fp_value(fp(0.0)); - - // Catch asynchronous exceptions - auto exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { - try { - std::rethrow_exception(e); - } - catch (sycl::exception const &e) { - std::cout << "Caught asynchronous SYCL " - "exception during sparse::spmv:\n" - << e.what() << std::endl; - } - } - }; - - // create execution queue and buffers of matrix data - sycl::queue cpu_queue(cpu_dev, exception_handler); - oneapi::mkl::backend_selector cpu_selector{ cpu_queue }; - - intType *ia, *ja; - fp *a, *x, *y, *z; - std::size_t sizea = static_cast(27 * nrows); - std::size_t sizeja = static_cast(27 * nrows); - std::size_t sizeia = static_cast(nrows + 1); - std::size_t sizevec = static_cast(nrows); - - ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), cpu_queue); - ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), cpu_queue); - a = (fp *)sycl::malloc_shared(sizea * sizeof(fp), cpu_queue); - x = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue); - y = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue); - z = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), cpu_queue); - - if (!ia || !ja || !a || !x || !y || !z) { - throw std::runtime_error("Failed to allocate USM memory"); - } - - intType nnz = generate_sparse_matrix(size, ia, ja, a); - - // Init vectors x and y - for (int i = 0; i < nrows; i++) { - x[i] = set_fp_value(fp(1.0)); - y[i] = set_fp_value(fp(0.0)); - z[i] = set_fp_value(fp(0.0)); - } - - std::vector int_ptr_vec; - int_ptr_vec.push_back(ia); - int_ptr_vec.push_back(ja); - std::vector fp_ptr_vec; - fp_ptr_vec.push_back(a); - fp_ptr_vec.push_back(x); - fp_ptr_vec.push_back(y); - fp_ptr_vec.push_back(z); - - // 
- // Execute Matrix Multiply - // - - oneapi::mkl::transpose transA = oneapi::mkl::transpose::nontrans; - oneapi::mkl::sparse::spmv_alg alg = oneapi::mkl::sparse::spmv_alg::default_alg; - oneapi::mkl::sparse::matrix_view A_view; - - std::cout << "\n\t\tsparse::spmv parameters:\n"; - std::cout << "\t\t\ttransA = " - << (transA == oneapi::mkl::transpose::nontrans - ? "nontrans" - : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans")) - << std::endl; - std::cout << "\t\t\tnrows = " << nrows << std::endl; - std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl; - - // Create and initialize handle for a Sparse Matrix in CSR format - oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr; - oneapi::mkl::sparse::init_csr_matrix(cpu_selector, &A_handle, nrows, nrows, nnz, - oneapi::mkl::index_base::zero, ia, ja, a); - - // Create and initialize dense vector handles - oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr; - oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr; - oneapi::mkl::sparse::init_dense_vector(cpu_selector, &x_handle, sizevec, x); - oneapi::mkl::sparse::init_dense_vector(cpu_selector, &y_handle, sizevec, y); - - // Create operation descriptor - oneapi::mkl::sparse::spmv_descr_t descr = nullptr; - oneapi::mkl::sparse::init_spmv_descr(cpu_selector, &descr); - - // Allocate external workspace - std::size_t workspace_size = 0; - oneapi::mkl::sparse::spmv_buffer_size(cpu_selector, transA, &alpha, A_view, A_handle, x_handle, - &beta, y_handle, alg, descr, workspace_size); - void *workspace = sycl::malloc_device(workspace_size, cpu_queue); - - // Optimize spmv - auto ev_opt = - oneapi::mkl::sparse::spmv_optimize(cpu_selector, transA, &alpha, A_view, A_handle, x_handle, - &beta, y_handle, alg, descr, workspace); - - // Run spmv - auto ev_spmv = oneapi::mkl::sparse::spmv(cpu_selector, transA, &alpha, A_view, A_handle, - x_handle, &beta, y_handle, alg, descr, { ev_opt }); - - // Release handles and 
descriptor - std::vector release_events; - release_events.push_back( - oneapi::mkl::sparse::release_dense_vector(cpu_selector, x_handle, { ev_spmv })); - release_events.push_back( - oneapi::mkl::sparse::release_dense_vector(cpu_selector, y_handle, { ev_spmv })); - release_events.push_back( - oneapi::mkl::sparse::release_sparse_matrix(cpu_selector, A_handle, { ev_spmv })); - release_events.push_back( - oneapi::mkl::sparse::release_spmv_descr(cpu_selector, descr, { ev_spmv })); - for (auto event : release_events) { - event.wait_and_throw(); - } - - // - // Post Processing - // - - fp *res = y; - const bool isConj = (transA == oneapi::mkl::transpose::conjtrans); - for (intType row = 0; row < nrows; row++) { - z[row] *= beta; - } - for (intType row = 0; row < nrows; row++) { - fp tmp = alpha * x[row]; - for (intType i = ia[row]; i < ia[row + 1]; i++) { - if constexpr (is_complex()) { - z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]); - } - else { - z[ja[i]] += tmp * a[i]; - } - } - } - - bool good = true; - for (intType row = 0; row < nrows; row++) { - good &= check_result(res[row], z[row], nrows, row); - } - - std::cout << "\n\t\t sparse::spmv example " << (good ? 
"passed" : "failed") << "\n\tFinished" - << std::endl; - - free_vec(fp_ptr_vec, cpu_queue); - free_vec(int_ptr_vec, cpu_queue); - - if (!good) - return 1; - - return 0; -} - -// -// Description of example setup, apis used and supported floating point type -// precisions -// -void print_example_banner() { - std::cout << "" << std::endl; - std::cout << "########################################################################" - << std::endl; - std::cout << "# Sparse Matrix-Vector Multiply Example: " << std::endl; - std::cout << "# " << std::endl; - std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl; - std::cout << "# " << std::endl; - std::cout << "# where A is a sparse matrix in CSR format, x and y are " - "dense vectors" - << std::endl; - std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl; - std::cout << "# " << std::endl; - std::cout << "# Using apis:" << std::endl; - std::cout << "# sparse::spmv" << std::endl; - std::cout << "# " << std::endl; - std::cout << "# Using single precision (float) data type" << std::endl; - std::cout << "# " << std::endl; - std::cout << "# Running on Intel CPU device" << std::endl; - std::cout << "# " << std::endl; - std::cout << "########################################################################" - << std::endl; - std::cout << std::endl; -} - -// -// Main entry point for example -// -int main(int /*argc*/, char ** /*argv*/) { - print_example_banner(); - - try { - // TODO: Add cuSPARSE compile-time dispatcher in this example once it is supported. - sycl::device cpu_dev(sycl::cpu_selector_v); - - std::cout << "Running Sparse BLAS SPMV USM example on CPU device." << std::endl; - std::cout << "Device name is: " << cpu_dev.get_info() - << std::endl; - std::cout << "Running with single precision real data type:" << std::endl; - - run_sparse_matrix_vector_multiply_example(cpu_dev); - std::cout << "Sparse BLAS SPMV USM example ran OK." 
<< std::endl; - } - catch (sycl::exception const &e) { - std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl; - std::cerr << "\t" << e.what() << std::endl; - std::cerr << "\tSYCL error code: " << e.code().value() << std::endl; - return 1; - } - catch (std::exception const &e) { - std::cerr << "Caught std::exception during Sparse SPMV:" << std::endl; - std::cerr << "\t" << e.what() << std::endl; - return 1; - } - - return 0; -} From 112222161dddfc6215e1b625859ae8100df6f213 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 10 Sep 2024 17:21:30 +0200 Subject: [PATCH 03/43] Update compile time example description --- .../sparse_blas_spmv_usm_mklcpu_cusparse.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp index d025539f8..f3fc5b416 100644 --- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp +++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp @@ -22,7 +22,7 @@ * Content: * This example demonstrates use of DPCPP API oneapi::mkl::sparse::spmv * using unified shared memory to perform general sparse matrix-vector -* multiplication on a INTEL CPU SYCL device. +* multiplication on a INTEL CPU SYCL device and an NVIDIA GPU SYCL device. 
* * y = alpha * op(A) * x + beta * y * From f42eab5b542fe7d3a76303fd88b6820629713ab2 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 10 Sep 2024 17:23:19 +0200 Subject: [PATCH 04/43] Remove unused mkl_helper file --- .../backends/mkl_common/mkl_helper.hpp | 111 ------------------ 1 file changed, 111 deletions(-) delete mode 100644 src/sparse_blas/backends/mkl_common/mkl_helper.hpp diff --git a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp b/src/sparse_blas/backends/mkl_common/mkl_helper.hpp deleted file mode 100644 index ca15c5b4f..000000000 --- a/src/sparse_blas/backends/mkl_common/mkl_helper.hpp +++ /dev/null @@ -1,111 +0,0 @@ -/*************************************************************************** -* Copyright (C) Codeplay Software Limited -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* For your convenience, a copy of the License has been included in this -* repository. -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -**************************************************************************/ - -#ifndef _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_HELPER_HPP_ -#define _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_HELPER_HPP_ - -#if __has_include() -#include -#else -#include -#endif - -#include "oneapi/mkl/exceptions.hpp" -#include "oneapi/mkl/sparse_blas/detail/helper_types.hpp" - -#include "sparse_blas/enum_data_types.hpp" -#include "sparse_blas/macros.hpp" - -namespace oneapi::mkl::sparse::detail { - -/// Return whether a pointer is accessible on the host -template -inline bool is_ptr_accessible_on_host(sycl::queue &queue, const T *host_or_device_ptr) { - auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context()); - return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared || - alloc_type == sycl::usm::alloc::unknown; -} - -/// Throw an exception if the scalar is not accessible in the host -inline void check_ptr_is_host_accessible(const std::string &function_name, - const std::string &scalar_name, - bool is_ptr_accessible_on_host) { - if (!is_ptr_accessible_on_host) { - throw mkl::invalid_argument( - "sparse_blas", function_name, - "Scalar " + scalar_name + " must be accessible on the host for buffer functions."); - } -} - -/// Return a scalar on the host from a pointer to host or device memory -/// Used for USM functions -template -inline T get_scalar_on_host(sycl::queue &queue, const T *host_or_device_ptr, - bool is_ptr_accessible_on_host) { - if (is_ptr_accessible_on_host) { - return *host_or_device_ptr; - } - T scalar; - auto event = queue.copy(host_or_device_ptr, &scalar, 1); - event.wait_and_throw(); - return scalar; -} - -/// Merge multiple event dependencies into one -inline sycl::event collapse_dependencies(sycl::queue &queue, - const std::vector &dependencies) { - if (dependencies.empty()) { - return {}; - } - else if (dependencies.size() == 1) { - return dependencies[0]; - } - - return 
queue.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependencies); - cgh.host_task([=]() {}); - }); -} - -/// Convert \p value_type to template type argument and use it to call \p op_functor. -#define DISPATCH_MKL_OPERATION(function_name, value_type, op_functor, ...) \ - switch (value_type) { \ - case detail::data_type::real_fp32: return op_functor(__VA_ARGS__); \ - case detail::data_type::real_fp64: return op_functor(__VA_ARGS__); \ - case detail::data_type::complex_fp32: return op_functor>(__VA_ARGS__); \ - case detail::data_type::complex_fp64: \ - return op_functor>(__VA_ARGS__); \ - default: \ - throw oneapi::mkl::exception( \ - "sparse_blas", function_name, \ - "Internal error: unsupported type " + data_type_to_str(value_type)); \ - } - -#define CHECK_DESCR_MATCH(descr, argument, optimize_func_name) \ - do { \ - if (descr->last_optimized_##argument != argument) { \ - throw mkl::invalid_argument( \ - "sparse_blas", __func__, \ - #argument " argument must match with the previous call to " #optimize_func_name); \ - } \ - } while (0) - -} // namespace oneapi::mkl::sparse::detail - -#endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_MKL_COMMON_MKL_HELPER_HPP_ From 8584899e00fc00b2f5f9ee46ed82129952f7e8fa Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 10 Sep 2024 17:44:40 +0200 Subject: [PATCH 05/43] Update README with cuSPARSE --- README.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index fdbd4b8e9..c5e3eb63d 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,8 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org). - oneMKL interface - oneMKL selector + oneMKL interface + oneMKL selector Intel(R) oneAPI Math Kernel Library (oneMKL) x86 CPU, Intel GPU @@ -28,10 +28,10 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org). 
NVIDIA cuBLAS NVIDIA GPU - + NVIDIA cuSOLVER NVIDIA GPU - + NVIDIA cuRAND NVIDIA GPU @@ -40,6 +40,10 @@ oneMKL is part of the [UXL Foundation](http://www.uxlfoundation.org). NVIDIA cuFFT NVIDIA GPU + + NVIDIA cuSPARSE + NVIDIA GPU + NETLIB LAPACK x86 CPU @@ -317,7 +321,7 @@ Supported compilers include: Dynamic, Static - SPARSE_BLAS + SPARSE_BLAS x86 CPU Intel(R) oneMKL Intel DPC++ @@ -329,6 +333,12 @@ Supported compilers include: Intel DPC++ Dynamic, Static + + NVIDIA GPU + NVIDIA cuSPARSE + Open DPC++ + Dynamic, Static + From 7aa5177635e730a6f394783d996912de40a7be7c Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Wed, 11 Sep 2024 17:07:50 +0200 Subject: [PATCH 06/43] Fix typos and rewording --- src/sparse_blas/common_op_verification.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/sparse_blas/common_op_verification.hpp b/src/sparse_blas/common_op_verification.hpp index e496c725e..110c6d205 100644 --- a/src/sparse_blas/common_op_verification.hpp +++ b/src/sparse_blas/common_op_verification.hpp @@ -67,12 +67,12 @@ void check_valid_spmm_common(const std::string &function_name, } if (B_handle->dense_layout != C_handle->dense_layout) { throw mkl::invalid_argument("sparse_blas", function_name, - "B and C matrices must used the same layout."); + "B and C matrices must use the same layout."); } if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) { throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type must be `matrix_descr::general`."); + "Matrix view's `type_view` must be `matrix_descr::general`."); } if (A_view.diag_view != oneapi::mkl::diag::nonunit) { @@ -104,14 +104,14 @@ void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::tran } if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) { throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type cannot be diagonal."); + "Matrix view's `type_view` cannot be diagonal."); } if 
(A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular && A_view.diag_view == oneapi::mkl::diag::unit) { throw mkl::invalid_argument( "sparse_blas", function_name, - "`unit` diag_view can only be used with a triangular type_view."); + "`diag_view::unit` can only be used with `type_view::triangular`."); } } @@ -128,8 +128,9 @@ void check_valid_spsv_common(const std::string &function_name, check_all_containers_compatible(function_name, internal_A_handle, x_handle, y_handle); if (A_view.type_view != matrix_descr::triangular) { - throw mkl::invalid_argument("sparse_blas", function_name, - "Matrix view's type must be `matrix_descr::triangular`."); + throw mkl::invalid_argument( + "sparse_blas", function_name, + "Matrix view's `type_view` must be `matrix_descr::triangular`."); } if (internal_A_handle->all_use_buffer()) { From 70cdbe09e5b0168821d5bdc15d43393cd98cf9ae Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 12 Sep 2024 15:43:34 +0200 Subject: [PATCH 07/43] Rework test accuracy for spsv --- tests/unit_tests/sparse_blas/include/test_common.hpp | 4 ++-- tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp | 7 ++++--- tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp | 7 ++++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp index a02f91789..892e0969c 100644 --- a/tests/unit_tests/sparse_blas/include/test_common.hpp +++ b/tests/unit_tests/sparse_blas/include/test_common.hpp @@ -207,9 +207,9 @@ template fpType generate_data(bool is_diag) { rand_scalar rand_data; if (is_diag) { - // Guarantee an amplitude >= 0.1 + // Guarantee a large amplitude fpType sign = (std::rand() % 2) * 2 - 1; - return rand_data(0.1, 0.5) * sign; + return rand_data(10, 20) * sign; } return rand_data(-0.5, 0.5); } diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp 
b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp index b64219b9a..2c7218b59 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp @@ -47,6 +47,9 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); + // Use a fixed seed for operations very sensible to the input data + std::srand(1); + // Input matrix std::vector ia_host, ja_host; std::vector a_host; @@ -171,10 +174,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl // Compare the results of reference implementation and DPC++ implementation. // Increase default relative error margin for tests that lead to large numeric values. - double abs_error_factor = 10; - double rel_error_factor = 1E5; auto y_acc = y_buf.get_host_access(sycl::read_only); - bool valid = check_equal_vector(y_acc, y_ref_host, abs_error_factor, rel_error_factor); + bool valid = check_equal_vector(y_acc, y_ref_host); return static_cast(valid); } diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index be427d011..e7be2a4e5 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -43,6 +43,9 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); + // Use a fixed seed for operations very sensible to the input data + std::srand(1); + // Input matrix std::vector ia_host, ja_host; std::vector a_host; @@ -219,10 +222,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl // Compare the results of reference implementation and DPC++ implementation. 
// Increase default relative error margin for tests that lead to large numeric values. - double abs_error_factor = 10; - double rel_error_factor = 1E5; ev_copy.wait_and_throw(); - bool valid = check_equal_vector(y_host, y_ref_host, abs_error_factor, rel_error_factor); + bool valid = check_equal_vector(y_host, y_ref_host); return static_cast(valid); } From 81733315974546d853fdd91aae9de5fcd597d38e Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 12 Sep 2024 16:56:25 +0200 Subject: [PATCH 08/43] Remove get_mem for USM --- .../backends/cusparse/cusparse_handles.cpp | 47 +++++++++---------- .../cusparse/cusparse_scope_handle.hpp | 5 -- 2 files changed, 21 insertions(+), 31 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp index de7236110..1909c8d3c 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -59,8 +59,7 @@ void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, st sc.get_handle(queue); auto cuda_value_type = CudaEnumType::value; cusparseDnVecDescr_t cu_dvhandle; - CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, sc.get_mem(val), - cuda_value_type); + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, val, cuda_value_type); *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); }); }); @@ -104,13 +103,12 @@ void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, s if (dvhandle->size != size) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); auto cuda_value_type = CudaEnumType::value; - CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, - sc.get_mem(val), cuda_value_type); + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, val, + cuda_value_type); dvhandle->size = size; } else { - CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, - 
sc.get_mem(val)); + CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, val); } dvhandle->set_usm_ptr(val); }); @@ -162,8 +160,8 @@ void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, st auto cuda_value_type = CudaEnumType::value; auto cuda_order = get_cuda_order(dense_layout); cusparseDnMatDescr_t cu_dmhandle; - CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, - sc.get_mem(val), cuda_value_type, cuda_order); + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, val, + cuda_value_type, cuda_order); *p_dmhandle = new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout); }); @@ -218,15 +216,14 @@ void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, auto cuda_value_type = CudaEnumType::value; auto cuda_order = get_cuda_order(dense_layout); CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, - num_cols, ld, sc.get_mem(val), cuda_value_type, cuda_order); + num_cols, ld, val, cuda_value_type, cuda_order); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; dmhandle->ld = ld; dmhandle->dense_layout = dense_layout; } else { - CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, - sc.get_mem(val)); + CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, val); } dmhandle->set_usm_ptr(val); }); @@ -285,9 +282,8 @@ void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; - CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, - sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val), - cuda_index_type, cuda_index_base, cuda_value_type); + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, row_ind, + col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); *p_smhandle 
= new matrix_handle(cu_smhandle, row_ind, col_ind, val, num_rows, num_cols, nnz, index); }); @@ -351,16 +347,16 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, - nnz, sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val), - cuda_index_type, cuda_index_base, cuda_value_type); + nnz, row_ind, col_ind, val, cuda_index_type, cuda_index_base, + cuda_value_type); smhandle->num_rows = num_rows; smhandle->num_cols = num_cols; smhandle->nnz = nnz; smhandle->index = index; } else { - CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, - sc.get_mem(row_ind), sc.get_mem(col_ind), sc.get_mem(val)); + CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, row_ind, + col_ind, val); } smhandle->row_container.set_usm_ptr(row_ind); smhandle->col_container.set_usm_ptr(col_ind); @@ -411,9 +407,9 @@ void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; - CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, - sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val), - cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, row_ptr, + col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, + cuda_value_type); *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, num_rows, num_cols, nnz, index); }); @@ -477,17 +473,16 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, 
num_rows, num_cols, - nnz, sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val), - cuda_index_type, cuda_index_type, cuda_index_base, - cuda_value_type); + nnz, row_ptr, col_ind, val, cuda_index_type, cuda_index_type, + cuda_index_base, cuda_value_type); smhandle->num_rows = num_rows; smhandle->num_cols = num_cols; smhandle->nnz = nnz; smhandle->index = index; } else { - CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, - sc.get_mem(row_ptr), sc.get_mem(col_ind), sc.get_mem(val)); + CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, row_ptr, + col_ind, val); } smhandle->row_container.set_usm_ptr(row_ptr); smhandle->col_container.set_usm_ptr(col_ind); diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp index b56bb07cf..f1bc6cf38 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp @@ -81,11 +81,6 @@ class CusparseScopedContextHandler { auto cudaPtr = ih.get_native_mem(acc); return reinterpret_cast(cudaPtr); } - - template - inline void *get_mem(T *ptr) { - return reinterpret_cast(ptr); - } }; } // namespace oneapi::mkl::sparse::cusparse From 9c164252e29049d4fd632b72f374741738144084 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 13 Sep 2024 12:38:00 +0200 Subject: [PATCH 09/43] Map statuses CUSPARSE_STATUS_NOT_INITIALIZED and CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED --- src/sparse_blas/backends/cusparse/cusparse_error.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/sparse_blas/backends/cusparse/cusparse_error.hpp b/src/sparse_blas/backends/cusparse/cusparse_error.hpp index 7d6bf45d7..8d2f66c8f 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_error.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_error.hpp @@ -82,7 +82,10 @@ inline void check_status(cusparseStatus_t status, const std::string& function, switch (status) { case 
CUSPARSE_STATUS_NOT_SUPPORTED: throw oneapi::mkl::unimplemented("sparse_blas", function, error_str); + case CUSPARSE_STATUS_NOT_INITIALIZED: + throw oneapi::mkl::uninitialized("sparse_blas", function, error_str); case CUSPARSE_STATUS_INVALID_VALUE: + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: throw oneapi::mkl::invalid_argument("sparse_blas", function, error_str); default: throw oneapi::mkl::exception("sparse_blas", function, error_str); } From 36599a99563820a02338bfdb5572ac12e1b61c35 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 13 Sep 2024 14:35:49 +0200 Subject: [PATCH 10/43] Reword comment --- src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp index f1bc6cf38..0ad3c401a 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp @@ -74,8 +74,8 @@ class CusparseScopedContextHandler { /// See get_handle_and_stream cusparseHandle_t get_handle(const sycl::queue &queue); - // This is a work-around function for reinterpret_casting the memory. This - // will be fixed when SYCL-2020 has been implemented for Pi backend. + // Get the native pointer from an accessor. This is a different pointer than + // what can be retrieved with get_multi_ptr. 
template inline void *get_mem(AccT acc) { auto cudaPtr = ih.get_native_mem(acc); From 3b29e3292e8c158e90f86388174c3ede5853a174 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 20 Sep 2024 11:20:31 +0200 Subject: [PATCH 11/43] Reword comment --- tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp | 2 +- tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp index 2c7218b59..3a9d153d6 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp @@ -47,7 +47,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); - // Use a fixed seed for operations very sensible to the input data + // Use a fixed seed for operations very sensitive to the input data std::srand(1); // Input matrix diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index e7be2a4e5..6529069f9 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -43,7 +43,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); - // Use a fixed seed for operations very sensible to the input data + // Use a fixed seed for operations very sensitive to the input data std::srand(1); // Input matrix From 84565a9a7725be6efb03ef5bf378b36997d0bb18 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 20 Sep 2024 13:48:31 +0200 Subject: [PATCH 12/43] Remove redundant namespace --- .../cusparse/operations/cusparse_spmm.cpp | 77 
++++++++---------- .../cusparse/operations/cusparse_spmv.cpp | 79 +++++++------------ .../cusparse/operations/cusparse_spsv.cpp | 60 +++++--------- src/sparse_blas/common_op_verification.hpp | 24 +++--- src/sparse_blas/generic_container.hpp | 10 +-- 5 files changed, 97 insertions(+), 153 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index 09fe0515e..b1678c078 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -38,11 +38,11 @@ struct spmm_descr { bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; oneapi::mkl::transpose last_optimized_opB; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_B_handle; - oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_C_handle; - oneapi::mkl::sparse::spmm_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_matrix_handle_t last_optimized_B_handle; + dense_matrix_handle_t last_optimized_C_handle; + spmm_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse @@ -71,22 +71,20 @@ inline auto get_cuda_spmm_alg(spmm_alg alg) { } } -inline void fallback_alg_if_needed(oneapi::mkl::sparse::spmm_alg& alg, oneapi::mkl::transpose opA, +inline void fallback_alg_if_needed(spmm_alg& alg, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB) { - if (alg == oneapi::mkl::sparse::spmm_alg::csr_alg3 && + if (alg == spmm_alg::csr_alg3 && (opA != oneapi::mkl::transpose::nontrans || opB == oneapi::mkl::transpose::conjtrans)) { // Avoid warnings printed on std::cerr - alg = oneapi::mkl::sparse::spmm_alg::default_alg; + alg = spmm_alg::default_alg; } } void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, 
oneapi::mkl::transpose opB, - const void* alpha, oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr, std::size_t& temp_buffer_size) { + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, + dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, + std::size_t& temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle, @@ -113,12 +111,11 @@ void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mk spmm_descr->buffer_size_called = true; } -inline void common_spmm_optimize( - oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, bool is_alpha_host_accessible, - oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, bool is_beta_host_accessible, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr) { +inline void common_spmm_optimize(oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + bool is_alpha_host_accessible, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + bool is_beta_host_accessible, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr) { detail::check_valid_spmm_common("spmm_optimize", A_view, A_handle, B_handle, C_handle, is_alpha_host_accessible, is_beta_host_accessible); if (!spmm_descr->buffer_size_called) { @@ -136,11 +133,9 @@ inline void common_spmm_optimize( } void 
spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void* alpha, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, void* workspace_ptr, + oneapi::mkl::transpose opB, const void* alpha, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, + dense_matrix_handle_t C_handle, spmm_alg alg, void* workspace_ptr, bool is_alpha_host_accessible) { auto cu_a = A_handle->backend_handle; auto cu_b = B_handle->backend_handle; @@ -157,12 +152,9 @@ void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, } void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void* alpha, oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, - sycl::buffer workspace) { + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer workspace) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); if (!A_handle->all_use_buffer()) { @@ -172,7 +164,7 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: is_beta_host_accessible, C_handle, alg, spmm_descr); // Copy the buffer to extend its lifetime until the descriptor is free'd. 
spmm_descr->workspace.set_buffer_untyped(workspace); - if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg || workspace.size() == 0) { + if (alg == spmm_alg::no_optimize_alg || workspace.size() == 0) { // cusparseSpMM_preprocess cannot be called if the workspace is empty return; } @@ -191,13 +183,10 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: } sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void* alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr, void* workspace, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr, void* workspace, const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -207,7 +196,7 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, is_beta_host_accessible, C_handle, alg, spmm_descr); spmm_descr->workspace.usm_ptr = workspace; - if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg || workspace == nullptr) { + if (alg == spmm_alg::no_optimize_alg || workspace == nullptr) { // cusparseSpMM_preprocess cannot be called if the workspace is empty return detail::collapse_dependencies(queue, dependencies); } @@ -222,11 +211,9 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, } sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, 
oneapi::mkl::transpose opB, - const void* alpha, oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index e06f84695..f75464e91 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -37,11 +37,11 @@ struct spmv_descr { bool buffer_size_called = false; bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; - oneapi::mkl::sparse::spmv_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spmv_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse @@ -68,14 +68,12 @@ inline auto get_cuda_spmv_alg(spmv_alg alg) { } void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose opA, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t 
A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - bool is_alpha_host_accessible, bool is_beta_host_accessible) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible, + bool is_beta_host_accessible) { detail::check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, is_beta_host_accessible); - if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) { + if (A_view.type_view != matrix_descr::general) { throw mkl::unimplemented( "sparse_blas", function_name, "The backend does not support spmv with a `type_view` other than `matrix_descr::general`."); @@ -83,12 +81,9 @@ void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose o } void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, @@ -114,13 +109,10 @@ void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void } inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, - 
oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - bool is_beta_host_accessible, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, bool is_beta_host_accessible, + dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr) { check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, is_beta_host_accessible); if (!spmv_descr->buffer_size_called) { @@ -139,10 +131,8 @@ inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_ #if CUSPARSE_VERSION >= 12300 // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, void *workspace_ptr, + matrix_handle_t A_handle, dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t y_handle, spmv_alg alg, void *workspace_ptr, bool is_alpha_host_accessible) { auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; @@ -159,12 +149,9 @@ void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, #endif void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - sycl::buffer workspace) { + 
matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, sycl::buffer workspace) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); if (!A_handle->all_use_buffer()) { @@ -174,7 +161,7 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a is_beta_host_accessible, y_handle, alg, spmv_descr); // Copy the buffer to extend its lifetime until the descriptor is free'd. spmv_descr->workspace.set_buffer_untyped(workspace); - if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + if (alg == spmv_alg::no_optimize_alg) { return; } @@ -209,13 +196,10 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a } sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr, void *workspace, - const std::vector &dependencies) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, + void *workspace, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); if (A_handle->all_use_buffer()) { @@ -224,7 +208,7 @@ sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, is_beta_host_accessible, y_handle, alg, spmv_descr); 
spmv_descr->workspace.usm_ptr = workspace; - if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + if (alg == spmv_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); } @@ -242,12 +226,9 @@ sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const } sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - const std::vector &dependencies) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index 2f124caad..5eedeca70 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -37,11 +37,11 @@ struct spsv_descr { bool buffer_size_called = false; bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; - oneapi::mkl::sparse::spsv_alg last_optimized_alg; + matrix_view 
last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spsv_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse @@ -70,12 +70,9 @@ inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { } void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, std::size_t &temp_buffer_size) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, + std::size_t &temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); @@ -101,12 +98,9 @@ void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void } inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr) { detail::check_valid_spsv_common("spsv_optimize", A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); if (!spsv_descr->buffer_size_called) { @@ -123,12 +117,9 @@ inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_ } void 
spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, void *workspace_ptr, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr, void *workspace_ptr, bool is_alpha_host_accessible) { auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; @@ -146,11 +137,8 @@ void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, } void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer workspace) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); if (!A_handle->all_use_buffer()) { @@ -189,12 +177,9 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a } sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, void *workspace, + matrix_view A_view, matrix_handle_t 
A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr, void *workspace, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); if (A_handle->all_use_buffer()) { @@ -213,11 +198,8 @@ sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const } sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle, diff --git a/src/sparse_blas/common_op_verification.hpp b/src/sparse_blas/common_op_verification.hpp index 110c6d205..be31ad43f 100644 --- a/src/sparse_blas/common_op_verification.hpp +++ b/src/sparse_blas/common_op_verification.hpp @@ -45,11 +45,9 @@ inline void check_ptr_is_host_accessible(const std::string &function_name, } template -void check_valid_spmm_common(const std::string &function_name, - oneapi::mkl::sparse::matrix_view A_view, +void check_valid_spmm_common(const std::string &function_name, matrix_view A_view, InternalSparseMatHandleT internal_A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, + dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { THROW_IF_NULLPTR(function_name, internal_A_handle); 
THROW_IF_NULLPTR(function_name, B_handle); @@ -70,7 +68,7 @@ void check_valid_spmm_common(const std::string &function_name, "B and C matrices must use the same layout."); } - if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::general) { + if (A_view.type_view != matrix_descr::general) { throw mkl::invalid_argument("sparse_blas", function_name, "Matrix view's `type_view` must be `matrix_descr::general`."); } @@ -83,10 +81,8 @@ void check_valid_spmm_common(const std::string &function_name, template void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::transpose /*opA*/, - oneapi::mkl::sparse::matrix_view A_view, - InternalSparseMatHandleT internal_A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, + matrix_view A_view, InternalSparseMatHandleT internal_A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { THROW_IF_NULLPTR(function_name, internal_A_handle); THROW_IF_NULLPTR(function_name, x_handle); @@ -102,12 +98,12 @@ void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::tran "sparse_blas", function_name, "Alpha and beta must both be placed on host memory or device memory."); } - if (A_view.type_view == oneapi::mkl::sparse::matrix_descr::diagonal) { + if (A_view.type_view == matrix_descr::diagonal) { throw mkl::invalid_argument("sparse_blas", function_name, "Matrix view's `type_view` cannot be diagonal."); } - if (A_view.type_view != oneapi::mkl::sparse::matrix_descr::triangular && + if (A_view.type_view != matrix_descr::triangular && A_view.diag_view == oneapi::mkl::diag::unit) { throw mkl::invalid_argument( "sparse_blas", function_name, @@ -116,11 +112,9 @@ void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::tran } template -void check_valid_spsv_common(const std::string &function_name, - oneapi::mkl::sparse::matrix_view A_view, +void 
check_valid_spsv_common(const std::string &function_name, matrix_view A_view, InternalSparseMatHandleT internal_A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { THROW_IF_NULLPTR(function_name, internal_A_handle); THROW_IF_NULLPTR(function_name, x_handle); diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp index 5fe2b1ab2..33adf3abb 100644 --- a/src/sparse_blas/generic_container.hpp +++ b/src/sparse_blas/generic_container.hpp @@ -265,19 +265,19 @@ struct generic_sparse_handle { return row_container.data_type; } - void set_matrix_property(oneapi::mkl::sparse::matrix_property property) { + void set_matrix_property(matrix_property property) { properties_mask |= matrix_property_to_mask(property); } - bool has_matrix_property(oneapi::mkl::sparse::matrix_property property) { + bool has_matrix_property(matrix_property property) { return properties_mask & matrix_property_to_mask(property); } private: - std::int32_t matrix_property_to_mask(oneapi::mkl::sparse::matrix_property property) { + std::int32_t matrix_property_to_mask(matrix_property property) { switch (property) { - case oneapi::mkl::sparse::matrix_property::symmetric: return 1 << 0; - case oneapi::mkl::sparse::matrix_property::sorted: return 1 << 1; + case matrix_property::symmetric: return 1 << 0; + case matrix_property::sorted: return 1 << 1; default: throw oneapi::mkl::invalid_argument( "sparse_blas", "set_matrix_property", From 44dc73d8f8f4fdbef79bf06f140c0cc7d4349b4b Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 20 Sep 2024 14:15:30 +0200 Subject: [PATCH 13/43] Remove redundant namespace in MKL backends --- .../backends/mkl_common/mkl_handles.cxx | 56 +++++------ .../backends/mkl_common/mkl_spmm.cxx | 92 ++++++++----------- .../backends/mkl_common/mkl_spmv.cxx | 92 ++++++++----------- 
.../backends/mkl_common/mkl_spsv.cxx | 81 +++++++--------- 4 files changed, 131 insertions(+), 190 deletions(-) diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx index 7550625eb..8d1a923b6 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx @@ -17,34 +17,32 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. + // Dense vector template -void init_dense_vector(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t *p_dvhandle, std::int64_t size, - sycl::buffer val) { - *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size); +void init_dense_vector(sycl::queue & /*queue*/, dense_vector_handle_t *p_dvhandle, + std::int64_t size, sycl::buffer val) { + *p_dvhandle = new dense_vector_handle(val, size); } template -void init_dense_vector(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t *p_dvhandle, std::int64_t size, - fpType *val) { - *p_dvhandle = new oneapi::mkl::sparse::dense_vector_handle(val, size); +void init_dense_vector(sycl::queue & /*queue*/, dense_vector_handle_t *p_dvhandle, + std::int64_t size, fpType *val) { + *p_dvhandle = new dense_vector_handle(val, size); } template -void set_dense_vector_data(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size, - sycl::buffer val) { +void set_dense_vector_data(sycl::queue & /*queue*/, dense_vector_handle_t dvhandle, + std::int64_t size, sycl::buffer val) { detail::check_can_reset_value_handle(__func__, dvhandle, true); dvhandle->size = size; dvhandle->set_buffer(val); } template -void set_dense_vector_data(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_vector_handle_t dvhandle, std::int64_t size, - fpType *val) { +void 
set_dense_vector_data(sycl::queue & /*queue*/, dense_vector_handle_t dvhandle, + std::int64_t size, fpType *val) { detail::check_can_reset_value_handle(__func__, dvhandle, false); dvhandle->size = size; dvhandle->set_usm_ptr(val); @@ -52,34 +50,28 @@ void set_dense_vector_data(sycl::queue & /*queue*/, FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); -sycl::event release_dense_vector(sycl::queue &queue, - oneapi::mkl::sparse::dense_vector_handle_t dvhandle, +sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, const std::vector &dependencies) { return detail::submit_release(queue, dvhandle, dependencies); } // Dense matrix template -void init_dense_matrix(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t *p_dmhandle, +void init_dense_matrix(sycl::queue & /*queue*/, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, sycl::buffer val) { - *p_dmhandle = - new oneapi::mkl::sparse::dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); + *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); } template -void init_dense_matrix(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t *p_dmhandle, +void init_dense_matrix(sycl::queue & /*queue*/, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, fpType *val) { - *p_dmhandle = - new oneapi::mkl::sparse::dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); + *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); } template -void set_dense_matrix_data(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, +void set_dense_matrix_data(sycl::queue & /*queue*/, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, sycl::buffer val) { 
detail::check_can_reset_value_handle(__func__, dmhandle, true); @@ -91,8 +83,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/, } template -void set_dense_matrix_data(sycl::queue & /*queue*/, - oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, +void set_dense_matrix_data(sycl::queue & /*queue*/, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, fpType *val) { detail::check_can_reset_value_handle(__func__, dmhandle, false); @@ -105,8 +96,7 @@ void set_dense_matrix_data(sycl::queue & /*queue*/, FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); -sycl::event release_dense_matrix(sycl::queue &queue, - oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, +sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, const std::vector &dependencies) { return detail::submit_release(queue, dmhandle, dependencies); } @@ -286,18 +276,18 @@ sycl::event release_sparse_matrix(sycl::queue &queue, oneapi::mkl::sparse::matri } bool set_matrix_property(sycl::queue & /*queue*/, oneapi::mkl::sparse::matrix_handle_t smhandle, - oneapi::mkl::sparse::matrix_property property) { + matrix_property property) { auto internal_smhandle = detail::get_internal_handle(smhandle); // Store the matrix property internally for better error checking internal_smhandle->set_matrix_property(property); // Set the matrix property on the backend handle // Backend and oneMKL interface types for the property don't match switch (property) { - case oneapi::mkl::sparse::matrix_property::symmetric: + case matrix_property::symmetric: oneapi::mkl::sparse::set_matrix_property(internal_smhandle->backend_handle, oneapi::mkl::sparse::property::symmetric); return true; - case oneapi::mkl::sparse::matrix_property::sorted: + case matrix_property::sorted: oneapi::mkl::sparse::set_matrix_property(internal_smhandle->backend_handle, oneapi::mkl::sparse::property::sorted); return true; diff --git 
a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx index acde45cb4..ad12edcfb 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx @@ -17,6 +17,8 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. + namespace oneapi::mkl::sparse { struct spmm_descr { @@ -24,32 +26,30 @@ struct spmm_descr { bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; oneapi::mkl::transpose last_optimized_opB; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_B_handle; - oneapi::mkl::sparse::dense_matrix_handle_t last_optimized_C_handle; - oneapi::mkl::sparse::spmm_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_matrix_handle_t last_optimized_B_handle; + dense_matrix_handle_t last_optimized_C_handle; + spmm_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse namespace oneapi::mkl::sparse::BACKEND { -void init_spmm_descr(sycl::queue & /*queue*/, oneapi::mkl::sparse::spmm_descr_t *p_spmm_descr) { +void init_spmm_descr(sycl::queue & /*queue*/, spmm_descr_t *p_spmm_descr) { *p_spmm_descr = new spmm_descr(); } -sycl::event release_spmm_descr(sycl::queue &queue, oneapi::mkl::sparse::spmm_descr_t spmm_descr, +sycl::event release_spmm_descr(sycl::queue &queue, spmm_descr_t spmm_descr, const std::vector &dependencies) { return detail::submit_release(queue, spmm_descr, dependencies); } void check_valid_spmm(const std::string &function_name, oneapi::mkl::transpose opA, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t 
B_handle, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - bool is_alpha_host_accessible, bool is_beta_host_accessible) { + matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + dense_matrix_handle_t C_handle, bool is_alpha_host_accessible, + bool is_beta_host_accessible) { auto internal_A_handle = detail::get_internal_handle(A_handle); detail::check_valid_spmm_common(function_name, A_view, internal_A_handle, B_handle, C_handle, is_alpha_host_accessible, is_beta_host_accessible); @@ -59,7 +59,7 @@ void check_valid_spmm(const std::string &function_name, oneapi::mkl::transpose o if ((data_type == detail::data_type::complex_fp32 || data_type == detail::data_type::complex_fp64) && opA == oneapi::mkl::transpose::conjtrans && - internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::symmetric)) { + internal_A_handle->has_matrix_property(matrix_property::symmetric)) { throw mkl::unimplemented( "sparse_blas", function_name, "The backend does not support spmm using conjtrans and the symmetric property."); @@ -70,13 +70,10 @@ void check_valid_spmm(const std::string &function_name, oneapi::mkl::transpose o } void spmm_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose /*opB*/, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg /*alg*/, - oneapi::mkl::sparse::spmm_descr_t spmm_descr, std::size_t &temp_buffer_size) { + oneapi::mkl::transpose /*opB*/, const void *alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void *beta, + dense_matrix_handle_t C_handle, spmm_alg /*alg*/, spmm_descr_t spmm_descr, + std::size_t &temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. 
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -86,12 +83,11 @@ void spmm_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, spmm_descr->buffer_size_called = true; } -inline void common_spmm_optimize( - sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr) { +inline void common_spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmm("spmm_optimize", opA, A_view, A_handle, B_handle, C_handle, @@ -111,11 +107,9 @@ inline void common_spmm_optimize( } void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, + const void *alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void *beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer /*workspace*/) { auto internal_A_handle = 
detail::get_internal_handle(A_handle); if (!internal_A_handle->all_use_buffer()) { @@ -123,7 +117,7 @@ void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl:: } common_spmm_optimize(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr); - if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg) { + if (alg == spmm_alg::no_optimize_alg) { return; } internal_A_handle->can_be_reset = false; @@ -131,13 +125,10 @@ void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl:: } sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr, void * /*workspace*/, + oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr, void * /*workspace*/, const std::vector &dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { @@ -145,7 +136,7 @@ sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, } common_spmm_optimize(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr); - if (alg == oneapi::mkl::sparse::spmm_alg::no_optimize_alg) { + if (alg == spmm_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); } internal_A_handle->can_be_reset = false; @@ -154,13 +145,12 @@ sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, } template -sycl::event internal_spmm( - sycl::queue &queue, oneapi::mkl::transpose opA, 
oneapi::mkl::transpose opB, const void *alpha, - oneapi::mkl::sparse::matrix_view /*A_view*/, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg /*alg*/, - oneapi::mkl::sparse::spmm_descr_t /*spmm_descr*/, const std::vector &dependencies, - bool is_alpha_host_accessible, bool is_beta_host_accessible) { +sycl::event internal_spmm(sycl::queue &queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void *alpha, matrix_view /*A_view*/, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, + const void *beta, dense_matrix_handle_t C_handle, spmm_alg /*alg*/, + spmm_descr_t /*spmm_descr*/, const std::vector &dependencies, + bool is_alpha_host_accessible, bool is_beta_host_accessible) { T host_alpha = detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); T host_beta = @@ -187,11 +177,9 @@ sycl::event internal_spmm( } sycl::event spmm(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, - oneapi::mkl::sparse::dense_matrix_handle_t C_handle, - oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, + const void *alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void *beta, dense_matrix_handle_t C_handle, + spmm_alg alg, spmm_descr_t spmm_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx index cba197848..1859257e4 100644 --- 
a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx @@ -17,44 +17,44 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. + namespace oneapi::mkl::sparse { struct spmv_descr { bool buffer_size_called = false; bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; - oneapi::mkl::sparse::spmv_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spmv_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse namespace oneapi::mkl::sparse::BACKEND { -void init_spmv_descr(sycl::queue & /*queue*/, oneapi::mkl::sparse::spmv_descr_t *p_spmv_descr) { +void init_spmv_descr(sycl::queue & /*queue*/, spmv_descr_t *p_spmv_descr) { *p_spmv_descr = new spmv_descr(); } -sycl::event release_spmv_descr(sycl::queue &queue, oneapi::mkl::sparse::spmv_descr_t spmv_descr, +sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, const std::vector &dependencies) { return detail::submit_release(queue, spmv_descr, dependencies); } void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose opA, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - bool is_alpha_host_accessible, bool is_beta_host_accessible) { + matrix_view A_view, matrix_handle_t A_handle, 
dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible, + bool is_beta_host_accessible) { auto internal_A_handle = detail::get_internal_handle(A_handle); detail::check_valid_spmv_common(__func__, opA, A_view, internal_A_handle, x_handle, y_handle, is_alpha_host_accessible, is_beta_host_accessible); - if ((A_view.type_view == oneapi::mkl::sparse::matrix_descr::symmetric || - A_view.type_view == oneapi::mkl::sparse::matrix_descr::hermitian) && + if ((A_view.type_view == matrix_descr::symmetric || + A_view.type_view == matrix_descr::hermitian) && opA == oneapi::mkl::transpose::conjtrans) { throw mkl::unimplemented( "sparse_blas", function_name, @@ -63,12 +63,9 @@ void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose o } void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg /*alg*/, - oneapi::mkl::sparse::spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void *beta, dense_vector_handle_t y_handle, spmv_alg /*alg*/, + spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. 
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -79,13 +76,10 @@ void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void } inline void common_spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv("spmv_optimize", opA, A_view, A_handle, x_handle, y_handle, @@ -104,19 +98,16 @@ inline void common_spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, } void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - sycl::buffer /*workspace*/) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, sycl::buffer /*workspace*/) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (!internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } common_spmv_optimize(queue, 
opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr); - if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + if (alg == spmv_alg::no_optimize_alg) { return; } internal_A_handle->can_be_reset = false; @@ -135,20 +126,17 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a } sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr, void * /*workspace*/, - const std::vector &dependencies) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, + void * /*workspace*/, const std::vector &dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } common_spmv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr); - if (alg == oneapi::mkl::sparse::spmv_alg::no_optimize_alg) { + if (alg == spmv_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); } internal_A_handle->can_be_reset = false; @@ -168,13 +156,10 @@ sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const template sycl::event internal_spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg /*alg*/, - oneapi::mkl::sparse::spmv_descr_t 
/*spmv_descr*/, - const std::vector &dependencies, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t y_handle, spmv_alg /*alg*/, + spmv_descr_t /*spmv_descr*/, const std::vector &dependencies, bool is_alpha_host_accessible, bool is_beta_host_accessible) { T host_alpha = detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); @@ -223,12 +208,9 @@ sycl::event internal_spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const } sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - const std::vector &dependencies) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, diff --git a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx index 01575ac36..56a2491b2 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx @@ -17,44 +17,43 @@ * **************************************************************************/ +// In this file functions and types using the namespace oneapi::mkl::sparse:: refer to the backend's namespace for better readability. 
+ namespace oneapi::mkl::sparse { struct spsv_descr { bool buffer_size_called = false; bool optimized_called = false; oneapi::mkl::transpose last_optimized_opA; - oneapi::mkl::sparse::matrix_view last_optimized_A_view; - oneapi::mkl::sparse::matrix_handle_t last_optimized_A_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_x_handle; - oneapi::mkl::sparse::dense_vector_handle_t last_optimized_y_handle; - oneapi::mkl::sparse::spsv_alg last_optimized_alg; + matrix_view last_optimized_A_view; + matrix_handle_t last_optimized_A_handle; + dense_vector_handle_t last_optimized_x_handle; + dense_vector_handle_t last_optimized_y_handle; + spsv_alg last_optimized_alg; }; } // namespace oneapi::mkl::sparse namespace oneapi::mkl::sparse::BACKEND { -void init_spsv_descr(sycl::queue & /*queue*/, oneapi::mkl::sparse::spsv_descr_t *p_spsv_descr) { +void init_spsv_descr(sycl::queue & /*queue*/, spsv_descr_t *p_spsv_descr) { *p_spsv_descr = new spsv_descr(); } -sycl::event release_spsv_descr(sycl::queue &queue, oneapi::mkl::sparse::spsv_descr_t spsv_descr, +sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, const std::vector &dependencies) { return detail::submit_release(queue, spsv_descr, dependencies); } void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose opA, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - bool is_alpha_host_accessible, oneapi::mkl::sparse::spsv_alg alg) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible, spsv_alg alg) { auto internal_A_handle = detail::get_internal_handle(A_handle); detail::check_valid_spsv_common(function_name, A_view, internal_A_handle, x_handle, y_handle, is_alpha_host_accessible); - if (alg == 
oneapi::mkl::sparse::spsv_alg::no_optimize_alg && - !internal_A_handle->has_matrix_property(oneapi::mkl::sparse::matrix_property::sorted)) { + if (alg == spsv_alg::no_optimize_alg && + !internal_A_handle->has_matrix_property(matrix_property::sorted)) { throw mkl::unimplemented( "sparse_blas", function_name, "The backend does not support `no_optimize_alg` unless A_handle has the property `matrix_property::sorted`."); @@ -74,12 +73,9 @@ void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose o } void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, std::size_t &temp_buffer_size) { + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, + std::size_t &temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. 
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, @@ -89,12 +85,9 @@ void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void } inline void common_spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr) { + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv("spsv_optimize", opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, alg); @@ -112,18 +105,15 @@ inline void common_spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, } void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer /*workspace*/) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (!internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } common_spsv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); - if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg) { + if 
(alg == spsv_alg::no_optimize_alg) { return; } internal_A_handle->can_be_reset = false; @@ -132,19 +122,16 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a } sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, void * /*workspace*/, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr, void * /*workspace*/, const std::vector &dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } common_spsv_optimize(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); - if (alg == oneapi::mkl::sparse::spsv_alg::no_optimize_alg) { + if (alg == spsv_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); } internal_A_handle->can_be_reset = false; @@ -154,12 +141,9 @@ sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const template sycl::event internal_spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg /*alg*/, - oneapi::mkl::sparse::spsv_descr_t /*spsv_descr*/, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg /*alg*/, spsv_descr_t /*spsv_descr*/, const std::vector &dependencies, bool is_alpha_host_accessible) { T 
host_alpha = @@ -182,11 +166,8 @@ sycl::event internal_spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const } sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, - oneapi::mkl::sparse::matrix_view A_view, - oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, - oneapi::mkl::sparse::dense_vector_handle_t y_handle, - oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, From 050f22af0410d53bcc95eb97080c8414dc10f997 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 23 Sep 2024 12:04:34 +0200 Subject: [PATCH 14/43] Add comments on the assumption made for buffers --- .../backends/cusparse/cusparse_task.hpp | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp index e839c5100..0b1deb3f9 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_task.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp @@ -63,8 +63,13 @@ auto get_int_accessors(sycl::handler &cgh, matrix_handle_t smhandle) { template void submit_host_task(sycl::handler &cgh, sycl::queue &queue, Functor functor, CaptureOnlyAcc... capture_only_accessors) { - // Only capture the accessors to ensure the dependencies are properly handled - // The accessors's pointer have already been set to the native container types in previous functions + // Only capture the accessors to ensure the dependencies are properly + // handled. 
The accessors' pointers have already been set to the native + // container types in previous functions. This assumes the underlying + // pointer of the buffer does not change. This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command. cgh.host_task([functor, queue, capture_only_accessors...](sycl::interop_handle ih) { auto unused = std::make_tuple(capture_only_accessors...); (void)unused; @@ -77,8 +82,13 @@ template void submit_host_task_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor functor, sycl::accessor workspace_placeholder_acc, CaptureOnlyAcc... capture_only_accessors) { - // Only capture the accessors to ensure the dependencies are properly handled - // The accessors's pointer have already been set to the native container types in previous functions + // Only capture the accessors to ensure the dependencies are properly + // handled. The accessors' pointers have already been set to the native + // container types in previous functions. This assumes the underlying + // pointer of the buffer does not change. This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command. cgh.require(workspace_placeholder_acc); cgh.host_task([functor, queue, workspace_placeholder_acc, capture_only_accessors...](sycl::interop_handle ih) { @@ -93,8 +103,13 @@ template void submit_native_command_ext(sycl::handler &cgh, sycl::queue &queue, Functor functor, const std::vector &dependencies, CaptureOnlyAcc...
capture_only_accessors) { - // Only capture the accessors to ensure the dependencies are properly handled - // The accessors's pointer have already been set to the native container types in previous functions + // Only capture the accessors to ensure the dependencies are properly + // handled. The accessors' pointers have already been set to the native + // container types in previous functions. This assumes the underlying + // pointer of the buffer does not change. This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command. #ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND cgh.ext_codeplay_enqueue_native_command( [functor, queue, dependencies, capture_only_accessors...](sycl::interop_handle ih) { @@ -128,8 +143,13 @@ void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, const std::vector &dependencies, sycl::accessor workspace_placeholder_acc, CaptureOnlyAcc... capture_only_accessors) { - // Only capture the accessors to ensure the dependencies are properly handled - // The accessors's pointer have already been set to the native container types in previous functions + // Only capture the accessors to ensure the dependencies are properly + // handled. The accessors' pointers have already been set to the native + // container types in previous functions. This assumes the underlying + // pointer of the buffer does not change. This is not guaranteed by the SYCL + // specification but should be true for all the implementations. This + // assumption avoids the overhead of resetting the pointer of all data + // handles for each enqueued command.
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND cgh.require(workspace_placeholder_acc); cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies, From 77d19e8f4da0f1fc7c63d26f54b243831bd295f3 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 23 Sep 2024 12:22:18 +0200 Subject: [PATCH 15/43] Throw unimplemented for some cases with csr_alg3 --- docs/domains/sparse_linear_algebra.rst | 3 ++ .../cusparse/operations/cusparse_spmm.cpp | 37 +++++++++++-------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index acff0380f..11c59407b 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -45,6 +45,9 @@ cuSPARSE backend Currently known limitations: +- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3`` and an ``opA`` other + than ``transpose::nontrans`` or an ``opB`` ``transpose::conjtrans`` will throw + an ``oneapi::mkl::unimplemented`` exception. - Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will throw an ``oneapi::mkl::unimplemented`` exception. - The COO format requires the indices to be sorted by row. 
See the `cuSPARSE diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index b1678c078..8c8db04eb 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -71,12 +71,21 @@ inline auto get_cuda_spmm_alg(spmm_alg alg) { } } -inline void fallback_alg_if_needed(spmm_alg& alg, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB) { - if (alg == spmm_alg::csr_alg3 && - (opA != oneapi::mkl::transpose::nontrans || opB == oneapi::mkl::transpose::conjtrans)) { - // Avoid warnings printed on std::cerr - alg = spmm_alg::default_alg; +void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle, + bool is_alpha_host_accessible, bool is_beta_host_accessible, spmm_alg alg) { + detail::check_valid_spmm_common(function_name, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); + if (alg == spmm_alg::csr_alg3 && opA != oneapi::mkl::transpose::nontrans) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opA` is not `transpose::nontrans`."); + } + if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::conjtrans) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::conjtrans`."); } } @@ -87,9 +96,8 @@ void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mk std::size_t& temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); - 
detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle, - is_alpha_host_accessible, is_beta_host_accessible); - fallback_alg_if_needed(alg, opA, opB); + check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler& sc) { auto cu_handle = sc.get_handle(queue); auto cu_a = A_handle->backend_handle; @@ -116,8 +124,8 @@ inline void common_spmm_optimize(oneapi::mkl::transpose opA, oneapi::mkl::transp matrix_handle_t A_handle, dense_matrix_handle_t B_handle, bool is_beta_host_accessible, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr) { - detail::check_valid_spmm_common("spmm_optimize", A_view, A_handle, B_handle, C_handle, - is_alpha_host_accessible, is_beta_host_accessible); + check_valid_spmm("spmm_optimize", opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); if (!spmm_descr->buffer_size_called) { throw mkl::uninitialized("sparse_blas", "spmm_optimize", "spmm_buffer_size must be called before spmm_optimize."); @@ -168,7 +176,6 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: // cusparseSpMM_preprocess cannot be called if the workspace is empty return; } - fallback_alg_if_needed(alg, opA, opB); auto functor = [=](CusparseScopedContextHandler& sc, sycl::accessor workspace_acc) { auto cu_handle = sc.get_handle(queue); @@ -200,7 +207,6 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, // cusparseSpMM_preprocess cannot be called if the workspace is empty return detail::collapse_dependencies(queue, dependencies); } - fallback_alg_if_needed(alg, opA, opB); auto functor = [=](CusparseScopedContextHandler& sc) { auto cu_handle = sc.get_handle(queue); spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg, @@ -217,8 +223,8 @@ sycl::event 
spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); - detail::check_valid_spmm_common(__func__, A_view, A_handle, B_handle, C_handle, - is_alpha_host_accessible, is_beta_host_accessible); + check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) { detail::throw_incompatible_container(__func__); } @@ -235,7 +241,6 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr CHECK_DESCR_MATCH(spmm_descr, C_handle, "spmm_optimize"); CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize"); - fallback_alg_if_needed(alg, opA, opB); auto compute_functor = [=](CusparseScopedContextHandler& sc, void* workspace_ptr) { auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); auto cu_a = A_handle->backend_handle; From d23b24d0abee93518b0df6cb6af13ac21792f77d Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 23 Sep 2024 17:37:29 +0200 Subject: [PATCH 16/43] Introduce sorted_by_rows property --- docs/domains/sparse_linear_algebra.rst | 9 ++- include/oneapi/mkl/sparse_blas/types.hpp | 1 + .../backends/cusparse/cusparse_handles.cpp | 16 ++--- .../backends/cusparse/cusparse_handles.hpp | 27 +++++--- .../cusparse/operations/cusparse_spmm.cpp | 1 + .../cusparse/operations/cusparse_spmv.cpp | 1 + .../cusparse/operations/cusparse_spsv.cpp | 18 ++++-- .../backends/mkl_common/mkl_handles.cxx | 16 ++--- src/sparse_blas/generic_container.hpp | 20 ++++-- .../sparse_blas/include/test_common.hpp | 63 ++++++++++++++----- .../sparse_blas/include/test_spmm.hpp | 40 ++++++------ .../sparse_blas/include/test_spmv.hpp | 59 ++++++++--------- .../sparse_blas/include/test_spsv.hpp | 59 +++++++++-------- 
.../sparse_blas/source/sparse_spmm_buffer.cpp | 16 ++--- .../sparse_blas/source/sparse_spmm_usm.cpp | 16 ++--- .../sparse_blas/source/sparse_spmv_buffer.cpp | 16 ++--- .../sparse_blas/source/sparse_spmv_usm.cpp | 16 ++--- .../sparse_blas/source/sparse_spsv_buffer.cpp | 14 ++--- .../sparse_blas/source/sparse_spsv_usm.cpp | 14 ++--- 19 files changed, 239 insertions(+), 183 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index 11c59407b..80b9b4419 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -45,14 +45,17 @@ cuSPARSE backend Currently known limitations: +- The COO format requires the indices to be sorted by row. See the `cuSPARSE + documentation + `_. Sparse + operations using matrices with the COO format without the property + ``matrix_property::sorted_by_rows`` or ``matrix_property::sorted`` will throw + an ``oneapi::mkl::unimplemented`` exception. - Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3`` and an ``opA`` other than ``transpose::nontrans`` or an ``opB`` ``transpose::conjtrans`` will throw an ``oneapi::mkl::unimplemented`` exception. - Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will throw an ``oneapi::mkl::unimplemented`` exception. -- The COO format requires the indices to be sorted by row. See the `cuSPARSE - documentation - `_. 
Operation algorithms mapping diff --git a/include/oneapi/mkl/sparse_blas/types.hpp b/include/oneapi/mkl/sparse_blas/types.hpp index d4aea3e88..39256f3c8 100644 --- a/include/oneapi/mkl/sparse_blas/types.hpp +++ b/include/oneapi/mkl/sparse_blas/types.hpp @@ -36,6 +36,7 @@ namespace sparse { enum class matrix_property { symmetric, sorted, + sorted_by_rows, }; enum class spmm_alg { diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp index 1909c8d3c..09bade903 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -263,8 +263,8 @@ void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc), cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, num_rows, num_cols, - nnz, index); + *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); @@ -284,8 +284,8 @@ void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, num_rows, num_cols, - nnz, index); + *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); @@ -388,8 +388,8 @@ void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, sc.get_mem(row_acc), sc.get_mem(col_acc), 
sc.get_mem(val_acc), cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, num_rows, num_cols, - nnz, index); + *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); @@ -410,8 +410,8 @@ void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, num_rows, num_cols, - nnz, index); + *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp index ac22d33ae..2653d84c1 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp @@ -59,20 +59,33 @@ struct dense_matrix_handle : public detail::generic_dense_matrix_handle { template matrix_handle(cusparseSpMatDescr_t cu_descr, intType* row_ptr, intType* col_ptr, - fpType* value_ptr, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, - oneapi::mkl::index_base index) + fpType* value_ptr, detail::sparse_format format, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index) : detail::generic_sparse_handle( - cu_descr, row_ptr, col_ptr, value_ptr, num_rows, num_cols, nnz, index) {} + cu_descr, row_ptr, col_ptr, value_ptr, format, num_rows, num_cols, nnz, index) {} template matrix_handle(cusparseSpMatDescr_t cu_descr, const sycl::buffer row_buffer, const sycl::buffer col_buffer, - const sycl::buffer value_buffer, std::int64_t num_rows, - 
std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index) - : detail::generic_sparse_handle( - cu_descr, row_buffer, col_buffer, value_buffer, num_rows, num_cols, nnz, index) {} + const sycl::buffer value_buffer, detail::sparse_format format, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, + oneapi::mkl::index_base index) + : detail::generic_sparse_handle(cu_descr, row_buffer, col_buffer, + value_buffer, format, num_rows, + num_cols, nnz, index) {} }; +inline void check_valid_matrix_properties(const std::string& function_name, + matrix_handle_t sm_handle) { + if (sm_handle->format == detail::sparse_format::COO && + !(sm_handle->has_matrix_property(matrix_property::sorted_by_rows) || + sm_handle->has_matrix_property(matrix_property::sorted))) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support unsorted COO format. Use `set_matrix_property` to set the property `matrix_property::sorted_by_rows` or `matrix_property::sorted`"); + } +} + } // namespace oneapi::mkl::sparse #endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_ diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index 8c8db04eb..116bbffe9 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -77,6 +77,7 @@ void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose o bool is_alpha_host_accessible, bool is_beta_host_accessible, spmm_alg alg) { detail::check_valid_spmm_common(function_name, A_view, A_handle, B_handle, C_handle, is_alpha_host_accessible, is_beta_host_accessible); + check_valid_matrix_properties(function_name, A_handle); if (alg == spmm_alg::csr_alg3 && opA != oneapi::mkl::transpose::nontrans) { throw mkl::unimplemented( "sparse_blas", function_name, diff --git 
a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index f75464e91..03cdd15e0 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -73,6 +73,7 @@ void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose o bool is_beta_host_accessible) { detail::check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, is_beta_host_accessible); + check_valid_matrix_properties(function_name, A_handle); if (A_view.type_view != matrix_descr::general) { throw mkl::unimplemented( "sparse_blas", function_name, diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index 5eedeca70..c06335100 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -69,13 +69,20 @@ inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { return CUSPARSE_SPSV_ALG_DEFAULT; } +void check_valid_spsv(const std::string &function_name, matrix_view A_view, + matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { + detail::check_valid_spsv_common(function_name, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + check_valid_matrix_properties(function_name, A_handle); +} + void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, std::size_t &temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle, - 
is_alpha_host_accessible); + check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) { auto cu_handle = sc.get_handle(queue); auto cu_a = A_handle->backend_handle; @@ -101,8 +108,8 @@ inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_ matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr) { - detail::check_valid_spsv_common("spsv_optimize", A_view, A_handle, x_handle, y_handle, - is_alpha_host_accessible); + check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); if (!spsv_descr->buffer_size_called) { throw mkl::uninitialized("sparse_blas", "spsv_optimize", "spsv_buffer_size must be called before spsv_optimize."); @@ -202,8 +209,7 @@ sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - detail::check_valid_spsv_common(__func__, A_view, A_handle, x_handle, y_handle, - is_alpha_host_accessible); + check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { detail::throw_incompatible_container(__func__); } diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx index 8d1a923b6..6ddbd43ef 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx @@ -109,8 +109,8 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p sycl::buffer col_ind, sycl::buffer val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; 
oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val, - num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle( + mkl_handle, row_ind, col_ind, val, detail::sparse_format::COO, num_rows, num_cols, nnz, index); // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. oneapi::mkl::sparse::set_coo_data(queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), static_cast(nnz), @@ -127,8 +127,8 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p fpType *val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val, - num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle( + mkl_handle, row_ind, col_ind, val, detail::sparse_format::COO, num_rows, num_cols, nnz, index); auto event = oneapi::mkl::sparse::set_coo_data( queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), static_cast(nnz), index, row_ind, col_ind, val); @@ -189,8 +189,8 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p sycl::buffer col_ind, sycl::buffer val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, - num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle( + mkl_handle, row_ptr, col_ind, val, detail::sparse_format::CSR, num_rows, num_cols, nnz, index); // The backend deduces nnz from row_ptr. // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. 
oneapi::mkl::sparse::set_csr_data(queue, mkl_handle, static_cast(num_rows), @@ -208,8 +208,8 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p fpType *val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, - num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle( + mkl_handle, row_ptr, col_ind, val, detail::sparse_format::CSR, num_rows, num_cols, nnz, index); // The backend deduces nnz from row_ptr. auto event = oneapi::mkl::sparse::set_csr_data( queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), index, diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp index 33adf3abb..09d408a77 100644 --- a/src/sparse_blas/generic_container.hpp +++ b/src/sparse_blas/generic_container.hpp @@ -205,6 +205,8 @@ struct generic_dense_matrix_handle : public detail::generic_dense_handle struct generic_sparse_handle { @@ -214,6 +216,7 @@ struct generic_sparse_handle { generic_container col_container; generic_container value_container; + sparse_format format; std::int64_t num_rows; std::int64_t num_cols; std::int64_t nnz; @@ -223,12 +226,13 @@ struct generic_sparse_handle { template generic_sparse_handle(BackendHandleT backend_handle, intType* row_ptr, intType* col_ptr, - fpType* value_ptr, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t nnz, oneapi::mkl::index_base index) + fpType* value_ptr, sparse_format format, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index) : backend_handle(backend_handle), row_container(generic_container(row_ptr)), col_container(generic_container(col_ptr)), value_container(generic_container(value_ptr)), + format(format), num_rows(num_rows), num_cols(num_cols), nnz(nnz), @@ -239,12 +243,14 @@ struct generic_sparse_handle { template 
generic_sparse_handle(BackendHandleT backend_handle, const sycl::buffer row_buffer, const sycl::buffer col_buffer, - const sycl::buffer value_buffer, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index) + const sycl::buffer value_buffer, sparse_format format, + std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, + oneapi::mkl::index_base index) : backend_handle(backend_handle), row_container(row_buffer), col_container(col_buffer), value_container(value_buffer), + format(format), num_rows(num_rows), num_cols(num_cols), nnz(nnz), @@ -266,6 +272,11 @@ struct generic_sparse_handle { } void set_matrix_property(matrix_property property) { + if (format == sparse_format::CSR && property == matrix_property::sorted_by_rows) { + throw mkl::invalid_argument( + "sparse_blas", "set_matrix_property", + "Property `matrix_property::sorted_by_rows` is not compatible with CSR format."); + } properties_mask |= matrix_property_to_mask(property); } @@ -278,6 +289,7 @@ struct generic_sparse_handle { switch (property) { case matrix_property::symmetric: return 1 << 0; case matrix_property::sorted: return 1 << 1; + case matrix_property::sorted_by_rows: return 1 << 2; default: throw oneapi::mkl::invalid_argument( "sparse_blas", "set_matrix_property", diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp index 892e0969c..3848c5bf1 100644 --- a/tests/unit_tests/sparse_blas/include/test_common.hpp +++ b/tests/unit_tests/sparse_blas/include/test_common.hpp @@ -59,12 +59,39 @@ enum sparse_matrix_format_t { COO, }; -static std::vector> test_matrix_properties{ - { oneapi::mkl::sparse::matrix_property::sorted }, - { oneapi::mkl::sparse::matrix_property::symmetric }, - { oneapi::mkl::sparse::matrix_property::sorted, - oneapi::mkl::sparse::matrix_property::symmetric } -}; +inline std::set get_default_matrix_properties( + sycl::queue queue, sparse_matrix_format_t format) { + auto 
vendor_id = oneapi::mkl::get_device_id(queue); + if (vendor_id == oneapi::mkl::device::nvidiagpu && format == sparse_matrix_format_t::COO) { + return { oneapi::mkl::sparse::matrix_property::sorted_by_rows }; + } + return {}; +} + +/// Return the combinations of matrix_properties to test other than the default +inline std::vector> +get_all_matrix_properties_combinations(sycl::queue queue, sparse_matrix_format_t format) { + auto vendor_id = oneapi::mkl::get_device_id(queue); + if (vendor_id == oneapi::mkl::device::nvidiagpu && format == sparse_matrix_format_t::COO) { + // Ensure all the sets have the sorted or sorted_by_rows properties + return { { oneapi::mkl::sparse::matrix_property::sorted }, + { oneapi::mkl::sparse::matrix_property::sorted_by_rows, + oneapi::mkl::sparse::matrix_property::symmetric }, + { oneapi::mkl::sparse::matrix_property::sorted, + oneapi::mkl::sparse::matrix_property::symmetric } }; + } + + std::vector> properties_combinations{ + { oneapi::mkl::sparse::matrix_property::sorted }, + { oneapi::mkl::sparse::matrix_property::symmetric }, + { oneapi::mkl::sparse::matrix_property::sorted, + oneapi::mkl::sparse::matrix_property::symmetric } + }; + if (format == sparse_matrix_format_t::COO) { + properties_combinations.push_back({ oneapi::mkl::sparse::matrix_property::sorted_by_rows }); + } + return properties_combinations; +} void print_error_code(sycl::exception const &e); @@ -332,18 +359,23 @@ intType generate_random_matrix(sparse_matrix_format_t format, const intType nrow throw std::runtime_error("Unsupported sparse format"); } -inline bool require_coo_sorted_by_row(sycl::queue queue) { - auto vendor_id = oneapi::mkl::get_device_id(queue); - return vendor_id == oneapi::mkl::device::nvidiagpu; -} - /// Shuffle the 3arrays CSR or COO representation (ia, ja, values) /// of any sparse matrix. /// In CSR format, the elements within a row are shuffled without changing ia. /// In COO format, all the elements are shuffled. 
template -void shuffle_sparse_matrix(sycl::queue queue, sparse_matrix_format_t format, intType indexing, - intType *ia, intType *ja, fpType *a, intType nnz, std::size_t nrows) { +void shuffle_sparse_matrix_if_needed( + sparse_matrix_format_t format, + const std::set &matrix_properties, intType indexing, + intType *ia, intType *ja, fpType *a, intType nnz, std::size_t nrows) { + const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != + matrix_properties.cend(); + if (is_sorted) { + return; + } + const bool is_sorted_by_rows = + matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted_by_rows) != + matrix_properties.cend(); if (format == sparse_matrix_format_t::CSR) { for (std::size_t i = 0; i < nrows; ++i) { intType nnz_row = ia[i + 1] - ia[i]; @@ -354,9 +386,10 @@ void shuffle_sparse_matrix(sycl::queue queue, sparse_matrix_format_t format, int std::swap(a[q], a[j]); } } + // sorted_by_rows does not impact CSR } else if (format == sparse_matrix_format_t::COO) { - if (require_coo_sorted_by_row(queue)) { + if (is_sorted_by_rows) { std::size_t linear_idx = 0; for (std::size_t i = 0; i < nrows; ++i) { // Count the number of non-zero elements for the given row @@ -386,7 +419,7 @@ void shuffle_sparse_matrix(sycl::queue queue, sparse_matrix_format_t format, int } } else { - throw oneapi::mkl::exception("sparse_blas", "shuffle_sparse_matrix", + throw oneapi::mkl::exception("sparse_blas", "shuffle_sparse_matrix_if_needed", "Internal error: unsupported format"); } } diff --git a/tests/unit_tests/sparse_blas/include/test_spmm.hpp b/tests/unit_tests/sparse_blas/include/test_spmm.hpp index 6188d4268..d47b1732c 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmm.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmm.hpp @@ -65,10 +65,13 @@ void test_helper_with_format_with_transpose( oneapi::mkl::layout col_major = oneapi::mkl::layout::col_major; oneapi::mkl::sparse::spmm_alg default_alg = 
oneapi::mkl::sparse::spmm_alg::default_alg; oneapi::mkl::sparse::matrix_view default_A_view; - std::set no_properties; bool no_reset_data = false; bool no_scalars_on_device = false; + // Queue is only used to get which matrix_property should be used for the tests. + sycl::queue properties_queue(*dev); + auto default_properties = get_default_matrix_properties(properties_queue, format); + { int m = 4, k = 6, n = 5; int nrows_A = (transpose_A != oneapi::mkl::transpose::nontrans) ? k : m; @@ -84,34 +87,34 @@ void test_helper_with_format_with_transpose( EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, + default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, true, + default_alg, default_A_view, default_properties, true, no_scalars_on_device), num_passed, num_skipped); // Test alpha and beta on the device EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, true), + default_alg, default_A_view, default_properties, no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, oneapi::mkl::index_base::one, col_major, transpose_A, transpose_B, - fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + fp_one, fp_zero, ldb, ldc, default_alg, 
default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, set_fp_value()(2.f, 1.5f), - fp_zero, ldb, ldc, default_alg, default_A_view, no_properties, + fp_zero, ldb, ldc, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default beta @@ -119,42 +122,43 @@ void test_helper_with_format_with_transpose( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, set_fp_value()(3.2f, 1.f), ldb, ldc, default_alg, - default_A_view, no_properties, no_reset_data, no_scalars_on_device), + default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_zero, fp_one, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, + default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha and beta EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_zero, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, + default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default ldb EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb + 5, ldc, - default_alg, default_A_view, no_properties, no_reset_data, + default_alg, 
default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default ldc EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc + 6, - default_alg, default_A_view, no_properties, no_reset_data, + default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test row major layout EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, oneapi::mkl::layout::row_major, transpose_A, transpose_B, fp_one, - fp_zero, ncols_B, ncols_C, default_alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + fp_zero, ncols_B, ncols_C, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test int64 indices long long_nrows_A = 27, long_ncols_A = 13, long_ncols_C = 6; @@ -163,19 +167,19 @@ void test_helper_with_format_with_transpose( test_functor_i64(dev, format, long_nrows_A, long_ncols_A, long_ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, long_ldb, long_ldc, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test other algorithms for (auto alg : non_default_algorithms) { EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, - ldb, ldc, alg, default_A_view, no_properties, no_reset_data, + ldb, ldc, alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } // Test matrix properties - for (auto properties : test_matrix_properties) { + for (auto properties : 
get_all_matrix_properties_combinations(properties_queue, format)) { EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, @@ -197,7 +201,7 @@ void test_helper_with_format_with_transpose( EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, no_properties, no_reset_data, + default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } diff --git a/tests/unit_tests/sparse_blas/include/test_spmv.hpp b/tests/unit_tests/sparse_blas/include/test_spmv.hpp index f141db893..66af38a7c 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmv.hpp @@ -63,62 +63,65 @@ void test_helper_with_format_with_transpose( oneapi::mkl::index_base index_zero = oneapi::mkl::index_base::zero; oneapi::mkl::sparse::spmv_alg default_alg = oneapi::mkl::sparse::spmv_alg::default_alg; oneapi::mkl::sparse::matrix_view default_A_view; - std::set no_properties; bool no_reset_data = false; bool no_scalars_on_device = false; + // Queue is only used to get which matrix_property should be used for the tests. 
+ sycl::queue properties_queue(*dev); + auto default_properties = get_default_matrix_properties(properties_queue, format); + // Basic test EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + fp_one, fp_zero, default_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, no_properties, true, + fp_one, fp_zero, default_alg, default_A_view, default_properties, true, no_scalars_on_device), num_passed, num_skipped); // Test alpha and beta on the device EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, no_properties, no_reset_data, - true), + fp_one, fp_zero, default_alg, default_A_view, default_properties, + no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, oneapi::mkl::index_base::one, transpose_val, fp_one, fp_zero, default_alg, - default_A_view, no_properties, no_reset_data, no_scalars_on_device), + default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, set_fp_value()(2.f, 1.5f), fp_zero, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default beta EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, 
ncols_A, density_A_matrix, index_zero, transpose_val, fp_one, set_fp_value()(3.2f, 1.f), default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_zero, fp_one, default_alg, default_A_view, no_properties, no_reset_data, - no_scalars_on_device), + fp_zero, fp_one, default_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha and beta EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_zero, fp_zero, default_alg, default_A_view, no_properties, + fp_zero, fp_zero, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test int64 indices EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i64(dev, format, 27L, 13L, density_A_matrix, index_zero, transpose_val, fp_one, - fp_zero, default_alg, default_A_view, no_properties, no_reset_data, + fp_zero, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower triangular @@ -126,14 +129,14 @@ void test_helper_with_format_with_transpose( oneapi::mkl::sparse::matrix_descr::triangular); EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_A_view, no_properties, + fp_one, fp_zero, default_alg, triangular_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper triangular triangular_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, 
triangular_A_view, no_properties, + fp_one, fp_zero, default_alg, triangular_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower triangular unit diagonal @@ -142,54 +145,54 @@ void test_helper_with_format_with_transpose( triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit; EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_unit_A_view, no_properties, + fp_one, fp_zero, default_alg, triangular_unit_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper triangular unit diagonal triangular_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_unit_A_view, no_properties, + fp_one, fp_zero, default_alg, triangular_unit_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower symmetric oneapi::mkl::sparse::matrix_view symmetric_view(oneapi::mkl::sparse::matrix_descr::symmetric); EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, symmetric_view, no_properties, no_reset_data, - no_scalars_on_device), + fp_one, fp_zero, default_alg, symmetric_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper symmetric symmetric_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, symmetric_view, no_properties, no_reset_data, - no_scalars_on_device), + fp_one, fp_zero, default_alg, symmetric_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower 
hermitian oneapi::mkl::sparse::matrix_view hermitian_view(oneapi::mkl::sparse::matrix_descr::hermitian); EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, hermitian_view, no_properties, no_reset_data, - no_scalars_on_device), + fp_one, fp_zero, default_alg, hermitian_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper hermitian hermitian_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, hermitian_view, no_properties, no_reset_data, - no_scalars_on_device), + fp_one, fp_zero, default_alg, hermitian_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test other algorithms for (auto alg : non_default_algorithms) { EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, - transpose_val, fp_one, fp_zero, alg, default_A_view, no_properties, - no_reset_data, no_scalars_on_device), + transpose_val, fp_one, fp_zero, alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } // Test matrix properties - for (auto properties : test_matrix_properties) { + for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view, diff --git a/tests/unit_tests/sparse_blas/include/test_spsv.hpp b/tests/unit_tests/sparse_blas/include/test_spsv.hpp index bdf9210f8..ca58dfd7a 100644 --- a/tests/unit_tests/sparse_blas/include/test_spsv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spsv.hpp @@ -60,76 +60,83 @@ void test_helper_with_format(testFunctorI32 
test_functor_i32, testFunctorI64 tes oneapi::mkl::sparse::matrix_view default_A_view(oneapi::mkl::sparse::matrix_descr::triangular); oneapi::mkl::sparse::matrix_view upper_A_view(oneapi::mkl::sparse::matrix_descr::triangular); upper_A_view.uplo_view = oneapi::mkl::uplo::upper; - std::set no_properties; bool no_reset_data = false; bool no_scalars_on_device = false; + // Queue is only used to get which matrix_property should be used for the tests. + sycl::queue properties_queue(*dev); + auto default_properties = get_default_matrix_properties(properties_queue, format); + // Basic test - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, - transpose_val, alpha, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), - num_passed, num_skipped); - // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, no_properties, true, no_scalars_on_device), + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); + // Reset data + EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, + default_properties, true, no_scalars_on_device), + num_passed, num_skipped); // Test alpha on the device EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, no_properties, no_reset_data, true), + default_alg, default_A_view, default_properties, no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, oneapi::mkl::index_base::one, - transpose_val, alpha, default_alg, default_A_view, no_properties, + transpose_val, alpha, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), 
num_passed, num_skipped); // Test upper triangular matrix - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, - transpose_val, alpha, default_alg, upper_A_view, - no_properties, no_reset_data, no_scalars_on_device), - num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, + default_alg, upper_A_view, default_properties, no_reset_data, + no_scalars_on_device), + num_passed, num_skipped); // Test lower triangular unit diagonal matrix oneapi::mkl::sparse::matrix_view triangular_unit_A_view( oneapi::mkl::sparse::matrix_descr::triangular); triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit; EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, triangular_unit_A_view, no_properties, no_reset_data, + default_alg, triangular_unit_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper triangular unit diagonal matrix triangular_unit_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, triangular_unit_A_view, no_properties, no_reset_data, + default_alg, triangular_unit_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, set_fp_value()(2.f, 1.5f), default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test int64 indices - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i64(dev, format, 15L, density_A_matrix, index_zero, - transpose_val, alpha, default_alg, default_A_view, - no_properties, no_reset_data, no_scalars_on_device), 
- num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i64(dev, format, 15L, density_A_matrix, index_zero, transpose_val, alpha, + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), + num_passed, num_skipped); // Test lower no_optimize_alg EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - no_optimize_alg, default_A_view, no_properties, no_reset_data, + no_optimize_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper no_optimize_alg - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, - transpose_val, alpha, no_optimize_alg, upper_A_view, - no_properties, no_reset_data, no_scalars_on_device), - num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, + no_optimize_alg, upper_A_view, default_properties, no_reset_data, + no_scalars_on_device), + num_passed, num_skipped); // Test matrix properties - for (auto properties : test_matrix_properties) { + for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { // Basic test with matrix properties EXPECT_TRUE_OR_FUTURE_SKIP( test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp index df6fb850b..4a37e8c7c 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp @@ -51,8 +51,6 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, auto [opa_nrows, opa_ncols] = swap_if_transposed(transpose_A, nrows_A, ncols_A); auto [opb_nrows, opb_ncols] = swap_if_transposed(transpose_B, opa_ncols, ncols_C); intType indexing = (index == 
oneapi::mkl::index_base::zero) ? 0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -73,10 +71,9 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, std::vector c_ref_host(c_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast(nrows_A)); auto ia_buf = make_buffer(ia_host); auto ja_buf = make_buffer(ja_host); @@ -119,10 +116,9 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType reset_nnz = generate_random_matrix( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast(nrows_A)); if (reset_nnz > nnz) { ia_buf = make_buffer(ia_host); ja_buf = make_buffer(ja_host); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp index 7d30426c4..8070633fc 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp @@ -47,8 +47,6 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, auto [opa_nrows, opa_ncols] = swap_if_transposed(transpose_A, nrows_A, ncols_A); auto [opb_nrows, 
opb_ncols] = swap_if_transposed(transpose_B, opa_ncols, ncols_C); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -69,10 +67,9 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, std::vector c_ref_host(c_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast(nrows_A)); auto ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); auto ja_usm_uptr = malloc_device_uptr(main_queue, ja_host.size()); @@ -152,10 +149,9 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType reset_nnz = generate_random_matrix( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast(nrows_A)); if (reset_nnz > nnz) { // Wait before freeing usm pointers ev_spmm.wait_and_throw(); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp index e03c09ebe..f56deaf91 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp @@ -45,8 +45,6 @@ int 
test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, } auto [opa_nrows, opa_ncols] = swap_if_transposed(transpose_val, nrows_A, ncols_A); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -66,10 +64,9 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, std::vector y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast(nrows_A)); auto ia_buf = make_buffer(ia_host); auto ja_buf = make_buffer(ja_host); @@ -109,10 +106,9 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType reset_nnz = generate_random_matrix( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast(nrows_A)); if (reset_nnz > nnz) { ia_buf = make_buffer(ia_host); ja_buf = make_buffer(ja_host); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp index eb54f6a5d..2852a2495 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp @@ -41,8 +41,6 
@@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, } auto [opa_nrows, opa_ncols] = swap_if_transposed(transpose_val, nrows_A, ncols_A); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -62,10 +60,9 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, std::vector y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, + static_cast(nrows_A)); auto ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); auto ja_usm_uptr = malloc_device_uptr(main_queue, ja_host.size()); @@ -144,10 +141,9 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType reset_nnz = generate_random_matrix( format, nrows_A, ncols_A, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric); - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, static_cast(nrows_A)); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, + static_cast(nrows_A)); if (reset_nnz > nnz) { // Wait before freeing usm pointers ev_spmv.wait_and_throw(); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp index 3a9d153d6..ebf47fd5e 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp 
+++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp @@ -41,8 +41,6 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; const std::size_t mu = static_cast(m); - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -72,10 +70,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl std::vector y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), nnz, mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, mu); auto ia_buf = make_buffer(ia_host); auto ja_buf = make_buffer(ja_host); @@ -112,10 +108,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl intType reset_nnz = generate_random_matrix( format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric, require_diagonal); - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, mu); if (reset_nnz > nnz) { ia_buf = make_buffer(ia_host); ja_buf = make_buffer(ja_host); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index 6529069f9..03edf8d37 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -37,8 +37,6 @@ int test_spsv(sycl::device 
*dev, sparse_matrix_format_t format, intType m, doubl intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; const std::size_t mu = static_cast(m); - const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != - matrix_properties.cend(); const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -68,10 +66,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl std::vector y_ref_host(y_host); // Shuffle ordering of column indices/values to test sortedness - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), nnz, mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), nnz, mu); auto ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); auto ja_usm_uptr = malloc_device_uptr(main_queue, ja_host.size()); @@ -143,10 +139,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl intType reset_nnz = generate_random_matrix( format, m, m, density_A_matrix, indexing, ia_host, ja_host, a_host, is_symmetric, require_diagonal); - if (!is_sorted) { - shuffle_sparse_matrix(main_queue, format, indexing, ia_host.data(), ja_host.data(), - a_host.data(), reset_nnz, mu); - } + shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), + ja_host.data(), a_host.data(), reset_nnz, mu); if (reset_nnz > nnz) { // Wait before freeing usm pointers ev_spsv.wait_and_throw(); From 4db2187e21ca99000cbab08cafa768e94e5df14a Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 24 Sep 2024 15:08:23 +0200 Subject: [PATCH 17/43] Avoid placeholder accessor --- .../backends/cusparse/cusparse_task.hpp | 75 +++++++++---------- .../cusparse/operations/cusparse_spmm.cpp | 9 +-- .../cusparse/operations/cusparse_spmv.cpp | 9 +-- 
.../cusparse/operations/cusparse_spsv.cpp | 4 +- src/sparse_blas/generic_container.hpp | 5 -- 5 files changed, 44 insertions(+), 58 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp index 0b1deb3f9..c6f34d49a 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_task.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp @@ -89,7 +89,6 @@ void submit_host_task_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor f // specification but should be true for all the implementations. This // assumption avoids the overhead of resetting the pointer of all data // handles for each enqueued command. - cgh.require(workspace_placeholder_acc); cgh.host_task([functor, queue, workspace_placeholder_acc, capture_only_accessors...](sycl::interop_handle ih) { auto unused = std::make_tuple(capture_only_accessors...); @@ -151,7 +150,6 @@ void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, // assumption avoids the overhead of resetting the pointer of all data // handles for each enqueued command. #ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - cgh.require(workspace_placeholder_acc); cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies, workspace_placeholder_acc, capture_only_accessors...](sycl::interop_handle ih) { @@ -196,36 +194,36 @@ template &dependencies, Functor functor, matrix_handle_t sm_handle, - sycl::accessor workspace_placeholder_acc, + sycl::buffer workspace_buffer, Ts... 
other_containers) { if (sm_handle->all_use_buffer()) { detail::data_type value_type = sm_handle->get_value_type(); detail::data_type int_type = sm_handle->get_int_type(); -#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE) \ - return queue.submit([&](sycl::handler &cgh) { \ - cgh.depends_on(dependencies); \ - auto fp_accs = get_fp_accessors(cgh, sm_handle, other_containers...); \ - auto int_accs = get_int_accessors(cgh, sm_handle); \ - if constexpr (UseWorkspace) { \ - if constexpr (UseEnqueueNativeCommandExt) { \ - submit_native_command_ext_with_acc(cgh, queue, functor, dependencies, \ - workspace_placeholder_acc, fp_accs, int_accs); \ - } \ - else { \ - submit_host_task_with_acc(cgh, queue, functor, workspace_placeholder_acc, fp_accs, \ - int_accs); \ - } \ - } \ - else { \ - (void)workspace_placeholder_acc; \ - if constexpr (UseEnqueueNativeCommandExt) { \ - submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, int_accs); \ - } \ - else { \ - submit_host_task(cgh, queue, functor, fp_accs, int_accs); \ - } \ - } \ +#define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE) \ + return queue.submit([&](sycl::handler &cgh) { \ + cgh.depends_on(dependencies); \ + auto fp_accs = get_fp_accessors(cgh, sm_handle, other_containers...); \ + auto int_accs = get_int_accessors(cgh, sm_handle); \ + auto workspace_acc = workspace_buffer.get_access(cgh); \ + if constexpr (UseWorkspace) { \ + if constexpr (UseEnqueueNativeCommandExt) { \ + submit_native_command_ext_with_acc(cgh, queue, functor, dependencies, \ + workspace_acc, fp_accs, int_accs); \ + } \ + else { \ + submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, int_accs); \ + } \ + } \ + else { \ + (void)workspace_buffer; \ + if constexpr (UseEnqueueNativeCommandExt) { \ + submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, int_accs); \ + } \ + else { \ + submit_host_task(cgh, queue, functor, fp_accs, int_accs); \ + } \ + } \ }) #define ONEMKL_CUSPARSE_SUBMIT_INT(FP_TYPE) \ if 
(int_type == detail::data_type::int32) { \ @@ -318,14 +316,12 @@ sycl::event dispatch_submit_impl_fp(const std::string &function_name, sycl::queu /// Helper function for dispatch_submit_impl_fp_int template sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, Functor functor, - matrix_handle_t sm_handle, - sycl::accessor workspace_placeholder_acc, + matrix_handle_t sm_handle, sycl::buffer workspace_buffer, Ts... other_containers) { constexpr bool UseWorkspace = true; constexpr bool UseEnqueueNativeCommandExt = false; return dispatch_submit_impl_fp_int( - function_name, queue, {}, functor, sm_handle, workspace_placeholder_acc, - other_containers...); + function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...); } /// Helper function for dispatch_submit_impl_fp_int @@ -335,8 +331,9 @@ sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, matrix_handle_t sm_handle, Ts... other_containers) { constexpr bool UseWorkspace = false; constexpr bool UseEnqueueNativeCommandExt = false; + sycl::buffer no_workspace(sycl::range<1>(0)); return dispatch_submit_impl_fp_int( - function_name, queue, dependencies, functor, sm_handle, {}, other_containers...); + function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...); } /// Helper function for dispatch_submit_impl_fp_int @@ -345,15 +342,16 @@ sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, matrix_handle_t sm_handle, Ts... 
other_containers) { constexpr bool UseWorkspace = false; constexpr bool UseEnqueueNativeCommandExt = false; + sycl::buffer no_workspace(sycl::range<1>(0)); return dispatch_submit_impl_fp_int( - function_name, queue, {}, functor, sm_handle, {}, other_containers...); + function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...); } /// Helper function for dispatch_submit_impl_fp_int template sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue, Functor functor, matrix_handle_t sm_handle, - sycl::accessor workspace_placeholder_acc, + sycl::buffer workspace_buffer, Ts... other_containers) { constexpr bool UseWorkspace = true; #ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND @@ -362,8 +360,7 @@ sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::q constexpr bool UseEnqueueNativeCommandExt = false; #endif return dispatch_submit_impl_fp_int( - function_name, queue, {}, functor, sm_handle, workspace_placeholder_acc, - other_containers...); + function_name, queue, {}, functor, sm_handle, workspace_buffer, other_containers...); } /// Helper function for dispatch_submit_impl_fp_int @@ -378,8 +375,9 @@ sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::q #else constexpr bool UseEnqueueNativeCommandExt = false; #endif + sycl::buffer no_workspace(sycl::range<1>(0)); return dispatch_submit_impl_fp_int( - function_name, queue, dependencies, functor, sm_handle, {}, other_containers...); + function_name, queue, dependencies, functor, sm_handle, no_workspace, other_containers...); } /// Helper function for dispatch_submit_impl_fp_int @@ -393,8 +391,9 @@ sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::q #else constexpr bool UseEnqueueNativeCommandExt = false; #endif + sycl::buffer no_workspace(sycl::range<1>(0)); return dispatch_submit_impl_fp_int( - function_name, queue, {}, functor, sm_handle, {}, other_containers...); + function_name, queue, 
{}, functor, sm_handle, no_workspace, other_containers...); } } // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index 116bbffe9..ca301a1e2 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -185,9 +185,7 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: workspace_ptr, is_alpha_host_accessible); }; - sycl::accessor workspace_placeholder_acc(workspace); - dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, B_handle, - C_handle); + dispatch_submit(__func__, queue, functor, A_handle, workspace, B_handle, C_handle); } sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, @@ -268,10 +266,9 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr auto workspace_ptr = sc.get_mem(workspace_acc); compute_functor(sc, workspace_ptr); }; - sycl::accessor workspace_placeholder_acc( - spmm_descr->workspace.get_buffer()); return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, - workspace_placeholder_acc, B_handle, C_handle); + spmm_descr->workspace.get_buffer(), + B_handle, C_handle); } else { // The same dispatch_submit can be used for USM or buffers if no diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index 03cdd15e0..a0db00d8a 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -181,9 +181,7 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a // The accessor can only be bound to the cgh if the buffer size is // greater than 0 - sycl::accessor workspace_placeholder_acc(workspace); - 
dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, x_handle, - y_handle); + dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { auto functor = [=](CusparseScopedContextHandler &sc) { @@ -284,10 +282,9 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp auto workspace_ptr = sc.get_mem(workspace_acc); compute_functor(sc, workspace_ptr); }; - sycl::accessor workspace_placeholder_acc( - spmv_descr->workspace.get_buffer()); return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, - workspace_placeholder_acc, x_handle, y_handle); + spmv_descr->workspace.get_buffer(), + x_handle, y_handle); } else { // The same dispatch_submit can be used for USM or buffers if no diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index c06335100..4488d1d02 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -168,9 +168,7 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a // The accessor can only be bound to the cgh if the buffer size is // greater than 0 - sycl::accessor workspace_placeholder_acc(workspace); - dispatch_submit(__func__, queue, functor, A_handle, workspace_placeholder_acc, x_handle, - y_handle); + dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { auto functor = [=](CusparseScopedContextHandler &sc) { diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp index 09d408a77..5fa278497 100644 --- a/src/sparse_blas/generic_container.hpp +++ b/src/sparse_blas/generic_container.hpp @@ -272,11 +272,6 @@ struct generic_sparse_handle { } void set_matrix_property(matrix_property property) { - if (format == sparse_format::CSR && property == matrix_property::sorted_by_rows) { 
- throw mkl::invalid_argument( - "sparse_blas", "set_matrix_property", - "Property `matrix_property::sorted_by_rows` is not compatible with CSR format."); - } properties_mask |= matrix_property_to_mask(property); } From f73fd0b0df04205a71c1e71eeaddd316f7bc4464 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 26 Sep 2024 15:00:17 +0200 Subject: [PATCH 18/43] Remove description from algorithms tables --- docs/domains/sparse_linear_algebra.rst | 113 ++++++++++--------------- 1 file changed, 45 insertions(+), 68 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index 80b9b4419..64f2d0e63 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -76,48 +76,36 @@ spmm :header-rows: 1 :widths: 10 30 45 - * - Value - - Description - - Backend equivalent + * - ``spmm_alg`` value + - MKLCPU/MKLGPU + - cuSPARSE * - ``default_optimize_alg`` - - Default algorithm. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT`` + - none + - ``CUSPARSE_SPMM_ALG_DEFAULT`` * - ``no_optimize_alg`` - - Default algorithm but may skip some optimizations. Useful only if an - operation with the same configuration is run once. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT`` + - none + - ``CUSPARSE_SPMM_ALG_DEFAULT`` * - ``coo_alg1`` - - Should provide best performance for COO format, small ``nnz`` and - column-major layout. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG1`` + - none + - ``CUSPARSE_SPMM_COO_ALG1`` * - ``coo_alg2`` - - Should provide best performance for COO format and column-major layout. - Produces deterministic results. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG2`` + - none + - ``CUSPARSE_SPMM_COO_ALG2`` * - ``coo_alg3`` - - Should provide best performance for COO format and large ``nnz``. 
- - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG3`` + - none + - ``CUSPARSE_SPMM_COO_ALG3`` * - ``coo_alg4`` - - Should provide best performance for COO format and row-major layout. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_COO_ALG4`` + - none + - ``CUSPARSE_SPMM_COO_ALG4`` * - ``csr_alg1`` - - Should provide best performance for CSR format and column-major layout. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG1`` + - none + - ``CUSPARSE_SPMM_CSR_ALG1`` * - ``csr_alg2`` - - Should provide best performance for CSR format and row-major layout. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG2`` + - none + - ``CUSPARSE_SPMM_CSR_ALG2`` * - ``csr_alg3`` - - Deterministic algorithm for CSR format. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_CSR_ALG3`` + - none + - ``CUSPARSE_SPMM_CSR_ALG3`` spmv @@ -127,38 +115,30 @@ spmv :header-rows: 1 :widths: 10 30 45 - * - Value - - Description - - Backend equivalent + * - ``spmv_alg`` value + - MKLCPU/MKLGPU + - cuSPARSE * - ``default_alg`` - - Default algorithm. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMV_ALG_DEFAULT`` + - none + - ``CUSPARSE_SPMV_ALG_DEFAULT`` * - ``no_optimize_alg`` - - Default algorithm but may skip some optimizations. Useful only if an - operation with the same configuration is run once. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT`` + - none + - ``CUSPARSE_SPMM_ALG_DEFAULT`` * - ``coo_alg1`` - - Default algorithm for COO format. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMV_COO_ALG1`` + - none + - ``CUSPARSE_SPMV_COO_ALG1`` * - ``coo_alg2`` - - Deterministic algorithm for COO format. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMV_COO_ALG2`` + - none + - ``CUSPARSE_SPMV_COO_ALG2`` * - ``csr_alg1`` - - Default algorithm for CSR format. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMV_CSR_ALG1`` + - none + - ``CUSPARSE_SPMV_CSR_ALG1`` * - ``csr_alg2`` - - Deterministic algorithm for CSR format. 
- - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMV_CSR_ALG2`` + - none + - ``CUSPARSE_SPMV_CSR_ALG2`` * - ``csr_alg3`` - - LRB variant of the algorithm for CSR format. - - | MKL: none - | cuSPARSE: none + - none + - none spsv @@ -168,15 +148,12 @@ spsv :header-rows: 1 :widths: 10 30 45 - * - Value - - Description - - Backend equivalent + * - ``spsv_alg`` value + - MKLCPU/MKLGPU + - cuSPARSE * - ``default_optimize_alg`` - - Default algorithm. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT`` + - none + - ``CUSPARSE_SPMM_ALG_DEFAULT`` * - ``no_optimize_alg`` - - Default algorithm but may skip some optimizations. Useful only if an - operation with the same configuration is run once. - - | MKL: none - | cuSPARSE: ``CUSPARSE_SPMM_ALG_DEFAULT`` + - none + - ``CUSPARSE_SPMM_ALG_DEFAULT`` From 18156e5f1a152cbcfa0fd567e76625286a2e08c3 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 26 Sep 2024 16:26:13 +0200 Subject: [PATCH 19/43] Use COO sorted_by_rows in example --- .../sparse_blas_spmv_usm_mklcpu_cusparse.cpp | 84 ++++++++++--------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp index f3fc5b416..316d2c744 100644 --- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp +++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp @@ -60,42 +60,44 @@ // is performed and finally the results are post processed. 
// template -int run_sparse_matrix_vector_multiply_example(const selectorType &selector) { +int run_sparse_matrix_vector_multiply_example(selectorType &selector) { auto queue = selector.get_queue(); // Matrix data size - intType size = 4; - intType nrows = size * size * size; + static constexpr intType size = 8; // Set scalar fpType values fpType alpha = set_fp_value(fpType(1.0)); fpType beta = set_fp_value(fpType(0.0)); - intType *ia, *ja; - fpType *a, *x, *y, *z; - std::size_t sizea = static_cast(27 * nrows); - std::size_t sizeja = static_cast(27 * nrows); - std::size_t sizeia = static_cast(nrows + 1); - std::size_t sizevec = static_cast(nrows); - - ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), queue); - ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), queue); - a = (fpType *)sycl::malloc_shared(sizea * sizeof(fpType), queue); - x = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue); - y = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue); - z = (fpType *)sycl::malloc_shared(sizevec * sizeof(fpType), queue); - - if (!ia || !ja || !a || !x || !y || !z) { + intType nnz = 9; + // host_ia must be sorted to maintain the sorted_by_rows property + intType host_ia[] = { 0, 0, 1, 3, 4, 4, 4, 7, 7 }; + intType host_ja[] = { 0, 7, 2, 2, 5, 4, 0, 0, 7 }; + + intType *ia = (intType *)sycl::malloc_shared(nnz * sizeof(intType), queue); + intType *ja = (intType *)sycl::malloc_shared(nnz * sizeof(intType), queue); + fpType *a = (fpType *)sycl::malloc_shared(nnz * sizeof(fpType), queue); + fpType *x = (fpType *)sycl::malloc_shared(size * sizeof(fpType), queue); + fpType *y = (fpType *)sycl::malloc_shared(size * sizeof(fpType), queue); + + if (!ia || !ja || !a || !x || !y) { throw std::runtime_error("Failed to allocate USM memory"); } - intType nnz = generate_sparse_matrix(size, ia, ja, a); + // Copy ia and ja + queue.memcpy(ia, host_ia, nnz * sizeof(intType)).wait_and_throw(); + queue.memcpy(ja, host_ja, nnz * 
sizeof(intType)).wait_and_throw(); + + // Init matrix values + for (int i = 0; i < nnz; i++) { + a[i] = set_fp_value(fpType(i + 1)); + } // Init vectors x and y - for (int i = 0; i < nrows; i++) { - x[i] = set_fp_value(fpType(1.0)); + for (int i = 0; i < size; i++) { + x[i] = set_fp_value(fpType(i + 1)); y[i] = set_fp_value(fpType(0.0)); - z[i] = set_fp_value(fpType(0.0)); } std::vector int_ptr_vec; @@ -105,7 +107,6 @@ int run_sparse_matrix_vector_multiply_example(const selectorType &selector) { fp_ptr_vec.push_back(a); fp_ptr_vec.push_back(x); fp_ptr_vec.push_back(y); - fp_ptr_vec.push_back(z); // // Execute Matrix Multiply @@ -121,19 +122,23 @@ int run_sparse_matrix_vector_multiply_example(const selectorType &selector) { ? "nontrans" : (transA == oneapi::mkl::transpose::trans ? "trans" : "conjtrans")) << std::endl; - std::cout << "\t\t\tnrows = " << nrows << std::endl; + std::cout << "\t\t\tsize = " << size << std::endl; std::cout << "\t\t\talpha = " << alpha << ", beta = " << beta << std::endl; - // Create and initialize handle for a Sparse Matrix in CSR format + // Create and initialize handle for a Sparse Matrix in COO format sorted by rows oneapi::mkl::sparse::matrix_handle_t A_handle = nullptr; - oneapi::mkl::sparse::init_csr_matrix(selector, &A_handle, nrows, nrows, nnz, + oneapi::mkl::sparse::init_coo_matrix(selector, &A_handle, size, size, nnz, oneapi::mkl::index_base::zero, ia, ja, a); + // cuSPARSE backend requires that the property sorted_by_rows or sorted is set when using matrices in COO format. + // Setting these properties is also the best practice to get best performance. 
+ oneapi::mkl::sparse::set_matrix_property(selector, A_handle, + oneapi::mkl::sparse::matrix_property::sorted_by_rows); // Create and initialize dense vector handles oneapi::mkl::sparse::dense_vector_handle_t x_handle = nullptr; oneapi::mkl::sparse::dense_vector_handle_t y_handle = nullptr; - oneapi::mkl::sparse::init_dense_vector(selector, &x_handle, sizevec, x); - oneapi::mkl::sparse::init_dense_vector(selector, &y_handle, sizevec, y); + oneapi::mkl::sparse::init_dense_vector(selector, &x_handle, size, x); + oneapi::mkl::sparse::init_dense_vector(selector, &y_handle, size, y); // Create operation descriptor oneapi::mkl::sparse::spmv_descr_t descr = nullptr; @@ -172,25 +177,26 @@ int run_sparse_matrix_vector_multiply_example(const selectorType &selector) { // fpType *res = y; + fpType expected_res[size]; const bool isConj = (transA == oneapi::mkl::transpose::conjtrans); - for (intType row = 0; row < nrows; row++) { - z[row] *= beta; + for (intType row = 0; row < size; row++) { + expected_res[row] *= beta; } - for (intType row = 0; row < nrows; row++) { + for (intType row = 0; row < size; row++) { fpType tmp = alpha * x[row]; for (intType i = ia[row]; i < ia[row + 1]; i++) { if constexpr (is_complex()) { - z[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]); + expected_res[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]); } else { - z[ja[i]] += tmp * a[i]; + expected_res[ja[i]] += tmp * a[i]; } } } bool good = true; - for (intType row = 0; row < nrows; row++) { - good &= check_result(res[row], z[row], nrows, row); + for (intType row = 0; row < size; row++) { + good &= check_result(res[row], expected_res[row], size, row); } std::cout << "\n\t\t sparse::spmv example " << (good ? 
"passed" : "failed") << "\n\tFinished" @@ -217,7 +223,7 @@ void print_example_banner() { std::cout << "# " << std::endl; std::cout << "# y = alpha * op(A) * x + beta * y" << std::endl; std::cout << "# " << std::endl; - std::cout << "# where A is a sparse matrix in CSR format, x and y are " + std::cout << "# where A is a sparse matrix in COO format, x and y are " "dense vectors" << std::endl; std::cout << "# and alpha, beta are floating point type precision scalars." << std::endl; @@ -256,7 +262,7 @@ int main(int /*argc*/, char ** /*argv*/) { try { sycl::queue cpu_queue(sycl::cpu_selector_v, exception_handler); sycl::queue gpu_queue(sycl::gpu_selector_v, exception_handler); - unsigned int vendor_id = gpu_queue.get_info(); + unsigned int vendor_id = gpu_queue.get_device().get_info(); if (vendor_id != NVIDIA_ID) { std::cerr << "FAILED: NVIDIA GPU device not found" << std::endl; return 1; @@ -265,9 +271,9 @@ int main(int /*argc*/, char ** /*argv*/) { oneapi::mkl::backend_selector gpu_selector{ gpu_queue }; std::cout << "Running Sparse BLAS SPMV USM example on:" << std::endl; - std::cout << "\tCPU device: " << cpu_queue.get_info() + std::cout << "\tCPU device: " << cpu_queue.get_device().get_info() << std::endl; - std::cout << "\tGPU device: " << gpu_queue.get_info() + std::cout << "\tGPU device: " << gpu_queue.get_device().get_info() << std::endl; std::cout << "Running with single precision real data type:" << std::endl; From 5addc1479936302a3624a8aa3b95e95234ed78a1 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 16:25:26 +0200 Subject: [PATCH 20/43] Document missing feature --- docs/domains/sparse_linear_algebra.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index 64f2d0e63..71bacf927 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -56,6 +56,9 @@ Currently known limitations: an 
``oneapi::mkl::unimplemented`` exception. - Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will throw an ``oneapi::mkl::unimplemented`` exception. +- oneMKL Interface does not provide a way to use non-default algorithms without + calling preprocess functions such as ``cusparseSpMM_preprocess`` or + ``cusparseSpMV_preprocess``. Feel free to create an issue if this is needed. Operation algorithms mapping From 6946d64b3bdba86ecc456d5b2c8d15f945dc5f09 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 16:28:26 +0200 Subject: [PATCH 21/43] Remove workaround for alpha and beta spmv --- .../backends/cusparse/operations/cusparse_spmv.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index a0db00d8a..45865688e 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -256,19 +256,9 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp auto cu_op = get_cuda_operation(type, opA); auto cu_type = get_cuda_value_type(type); auto cu_alg = get_cuda_spmv_alg(alg); - // Workaround issue with captured alpha and beta causing a segfault inside cuSPARSE - // Copy alpha and beta locally in the largest data value type and use the local pointer - cuDoubleComplex local_alpha, local_beta; - const void *alpha_ptr = alpha, *beta_ptr = beta; - if (is_alpha_host_accessible) { - local_alpha = *reinterpret_cast(alpha_ptr); - local_beta = *reinterpret_cast(beta_ptr); - alpha_ptr = &local_alpha; - beta_ptr = &local_beta; - } set_pointer_mode(cu_handle, is_alpha_host_accessible); - auto status = cusparseSpMV(cu_handle, cu_op, alpha_ptr, cu_a, cu_x, beta_ptr, cu_y, cu_type, - cu_alg, workspace_ptr); + auto status = cusparseSpMV(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, 
cu_type, cu_alg, + workspace_ptr); check_status(status, __func__); #ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); From 8e77ead50d9069bad0a6ad22b9e826a093f83ecc Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 17:02:19 +0200 Subject: [PATCH 22/43] Reword comment on empty accessors --- .../backends/cusparse/operations/cusparse_spmm.cpp | 3 +-- .../backends/cusparse/operations/cusparse_spmv.cpp | 6 ++---- .../backends/cusparse/operations/cusparse_spsv.cpp | 3 +-- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index ca301a1e2..820f07e38 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -259,8 +259,7 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr #endif }; if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) { - // The accessor can only be bound to the cgh if the buffer size is - // greater than 0 + // The accessor can only be created if the buffer size is greater than 0 auto functor_buffer = [=](CusparseScopedContextHandler& sc, sycl::accessor workspace_acc) { auto workspace_ptr = sc.get_mem(workspace_acc); diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index 45865688e..7fe114159 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -179,8 +179,7 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a workspace_ptr, is_alpha_host_accessible); }; - // The accessor can only be bound to the cgh if the buffer size is - // greater than 0 + // The accessor can only be created if the buffer size is greater 
than 0 dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { @@ -265,8 +264,7 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp #endif }; if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) { - // The accessor can only be bound to the cgh if the buffer size is - // greater than 0 + // The accessor can only be created if the buffer size is greater than 0 auto functor_buffer = [=](CusparseScopedContextHandler &sc, sycl::accessor workspace_acc) { auto workspace_ptr = sc.get_mem(workspace_acc); diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index 4488d1d02..a2963eb80 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -166,8 +166,7 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a spsv_descr, workspace_ptr, is_alpha_host_accessible); }; - // The accessor can only be bound to the cgh if the buffer size is - // greater than 0 + // The accessor can only be created if the buffer size is greater than 0 dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { From 325f79495c2090b95538e0eaaa80ccf40f2e9737 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 17:03:50 +0200 Subject: [PATCH 23/43] Fix function name in exceptions --- .../backends/cusparse/operations/cusparse_spmm.cpp | 2 +- .../backends/cusparse/operations/cusparse_spmv.cpp | 2 +- .../backends/cusparse/operations/cusparse_spsv.cpp | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index 820f07e38..a179ae2f1 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ 
b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -157,7 +157,7 @@ void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, set_pointer_mode(cu_handle, is_alpha_host_accessible); auto status = cusparseSpMM_preprocess(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c, cu_type, cu_alg, workspace_ptr); - check_status(status, "optimize_spmm"); + check_status(status, "spmm_optimize"); } void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index 7fe114159..54621ce7d 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -145,7 +145,7 @@ void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, set_pointer_mode(cu_handle, is_alpha_host_accessible); auto status = cusparseSpMV_preprocess(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, cu_alg, workspace_ptr); - check_status(status, "optimize_spmv"); + check_status(status, "spmv_optimize"); } #endif diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index a2963eb80..cd991afc4 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -132,7 +132,7 @@ void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; auto type = A_handle->value_container.data_type; - set_matrix_attributes("optimize_spsv", cu_a, A_view); + set_matrix_attributes("spsv_optimize", cu_a, A_view); auto cu_op = get_cuda_operation(type, opA); auto cu_type = get_cuda_value_type(type); auto cu_alg = get_cuda_spsv_alg(alg); @@ -140,7 +140,7 
@@ void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, set_pointer_mode(cu_handle, is_alpha_host_accessible); auto status = cusparseSpSV_analysis(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, cu_descr, workspace_ptr); - check_status(status, "optimize_spsv"); + check_status(status, "spsv_optimize"); } void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, From 9bc77d1548a24b6cf3219c04b8a32e95aeb97218 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 17:17:17 +0200 Subject: [PATCH 24/43] Throw unimplemented for spsv using no_optimize_alg --- docs/domains/sparse_linear_algebra.rst | 2 + .../cusparse/operations/cusparse_spsv.cpp | 45 ++++++++++++++----- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index 71bacf927..f1ac8238c 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -56,6 +56,8 @@ Currently known limitations: an ``oneapi::mkl::unimplemented`` exception. - Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will throw an ``oneapi::mkl::unimplemented`` exception. +- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` will throw an + ``oneapi::mkl::unimplemented`` exception. - oneMKL Interface does not provide a way to use non-default algorithms without calling preprocess functions such as ``cusparseSpMM_preprocess`` or ``cusparseSpMV_preprocess``. Feel free to create an issue if this is needed. 
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index cd991afc4..08fab76f9 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -55,14 +55,34 @@ void init_spsv_descr(sycl::queue & /*queue*/, spsv_descr_t *p_spsv_descr) { sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, const std::vector &dependencies) { - // Use dispatch_submit to ensure the backend's descriptor is kept alive as long as the buffers are used - auto functor = [=](CusparseScopedContextHandler &) { + if (!spsv_descr) { + return {}; + } + + auto release_functor = [=]() { CUSPARSE_ERR_FUNC(cusparseSpSV_destroyDescr, spsv_descr->cu_descr); delete spsv_descr; }; - return dispatch_submit(__func__, queue, dependencies, functor, - spsv_descr->last_optimized_A_handle, spsv_descr->last_optimized_x_handle, - spsv_descr->last_optimized_y_handle); + + // Use dispatch_submit to ensure the backend's descriptor is kept alive as long as the buffers are used + // dispatch_submit can only be used if the descriptor's handles are valid + if (spsv_descr->last_optimized_A_handle && + spsv_descr->last_optimized_A_handle->all_use_buffer() && + spsv_descr->last_optimized_x_handle && spsv_descr->last_optimized_y_handle) { + auto dispatch_functor = [=](CusparseScopedContextHandler &) { + release_functor(); + }; + return dispatch_submit( + __func__, queue, dependencies, dispatch_functor, spsv_descr->last_optimized_A_handle, + spsv_descr->last_optimized_x_handle, spsv_descr->last_optimized_y_handle); + } + + // Release used if USM is used or the descriptor has been released before spsv_optimize has succeeded + sycl::event event = queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; } inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { @@ -71,10 
+91,15 @@ inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { void check_valid_spsv(const std::string &function_name, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { + dense_vector_handle_t y_handle, spsv_alg alg, bool is_alpha_host_accessible) { detail::check_valid_spsv_common(function_name, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); check_valid_matrix_properties(function_name, A_handle); + if (alg == spsv_alg::no_optimize_alg) { + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support the algorithm ``spsv_alg::no_optimize_alg``."); + } } void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, @@ -82,7 +107,7 @@ void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, std::size_t &temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); + check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, alg, is_alpha_host_accessible); auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) { auto cu_handle = sc.get_handle(queue); auto cu_a = A_handle->backend_handle; @@ -108,7 +133,7 @@ inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_ matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr) { - check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, + check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, alg, is_alpha_host_accessible); if (!spsv_descr->buffer_size_called) { throw mkl::uninitialized("sparse_blas", "spsv_optimize", @@ -153,7 +178,6 @@ void spsv_optimize(sycl::queue 
&queue, oneapi::mkl::transpose opA, const void *a } common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); - // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE // Copy the buffer to extend its lifetime until the descriptor is free'd. spsv_descr->workspace.set_buffer_untyped(workspace); @@ -191,7 +215,6 @@ sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const } common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); - // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE auto functor = [=](CusparseScopedContextHandler &sc) { auto cu_handle = sc.get_handle(queue); spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, @@ -206,7 +229,7 @@ sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); + check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, alg, is_alpha_host_accessible); if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { detail::throw_incompatible_container(__func__); } From 710b80b5395d8d8929171f5e32c8444113ca5010 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 17:18:56 +0200 Subject: [PATCH 25/43] Fix documentation typo --- docs/domains/sparse_linear_algebra.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index f1ac8238c..b3ed1493e 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -84,7 +84,7 @@ spmm * - ``spmm_alg`` value - MKLCPU/MKLGPU - cuSPARSE 
- * - ``default_optimize_alg`` + * - ``default_alg`` - none - ``CUSPARSE_SPMM_ALG_DEFAULT`` * - ``no_optimize_alg`` @@ -156,7 +156,7 @@ spsv * - ``spsv_alg`` value - MKLCPU/MKLGPU - cuSPARSE - * - ``default_optimize_alg`` + * - ``default_alg`` - none - ``CUSPARSE_SPMM_ALG_DEFAULT`` * - ``no_optimize_alg`` From 00e5cedf62a20dbe016241b4aecfd8c3fd1b9479 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 17:25:14 +0200 Subject: [PATCH 26/43] an -> a --- docs/domains/sparse_linear_algebra.rst | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index b3ed1493e..9d8a01864 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -20,21 +20,21 @@ Currently known limitations: - ``oneapi::mkl::sparse::set_csr_data`` and ``oneapi::mkl::sparse::set_coo_data`` functions cannot be used on a handle that has already been used for an operation or its optimize function. Doing so - will throw an ``oneapi::mkl::unimplemented`` exception. + will throw a ``oneapi::mkl::unimplemented`` exception. - Using ``spsv`` with the ``oneapi::mkl::sparse::spsv_alg::no_optimize_alg`` and a sparse matrix that does not have the - ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw an + ``oneapi::mkl::sparse::matrix_property::sorted`` property will throw a ``oneapi::mkl::unimplemented`` exception. - Using ``spmm`` on Intel GPU with a sparse matrix that is ``oneapi::mkl::transpose::conjtrans`` and has the - ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw an + ``oneapi::mkl::sparse::matrix_property::symmetric`` property will throw a ``oneapi::mkl::unimplemented`` exception. 
- Using ``spmv`` with a sparse matrix that is ``oneapi::mkl::transpose::conjtrans`` with a ``type_view`` - ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw an + ``matrix_descr::symmetric`` or ``matrix_descr::hermitian`` will throw a ``oneapi::mkl::unimplemented`` exception. - Using ``spsv`` on Intel GPU with a sparse matrix that is - ``oneapi::mkl::transpose::conjtrans`` and will throw an + ``oneapi::mkl::transpose::conjtrans`` and will throw a ``oneapi::mkl::unimplemented`` exception. - Scalar parameters ``alpha`` and ``beta`` should be host pointers to prevent synchronizations and copies to the host. @@ -50,13 +50,13 @@ Currently known limitations: `_. Sparse operations using matrices with the COO format without the property ``matrix_property::sorted_by_rows`` or ``matrix_property::sorted`` will throw - an ``oneapi::mkl::unimplemented`` exception. + a ``oneapi::mkl::unimplemented`` exception. - Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3`` and an ``opA`` other than ``transpose::nontrans`` or an ``opB`` ``transpose::conjtrans`` will throw - an ``oneapi::mkl::unimplemented`` exception. + a ``oneapi::mkl::unimplemented`` exception. - Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will - throw an ``oneapi::mkl::unimplemented`` exception. -- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` will throw an + throw a ``oneapi::mkl::unimplemented`` exception. +- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` will throw a ``oneapi::mkl::unimplemented`` exception. 
- oneMKL Interface does not provide a way to use non-default algorithms without calling preprocess functions such as ``cusparseSpMM_preprocess`` or From 342380ea5a64a27705faf6367bac25ea52114208 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 30 Sep 2024 19:39:34 +0200 Subject: [PATCH 27/43] Revert throwing unsupported for spsv + no_optimize_alg --- docs/domains/sparse_linear_algebra.rst | 4 ++-- .../cusparse/operations/cusparse_spsv.cpp | 15 ++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index 9d8a01864..c20651f31 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -56,8 +56,8 @@ Currently known limitations: a ``oneapi::mkl::unimplemented`` exception. - Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will throw a ``oneapi::mkl::unimplemented`` exception. -- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` will throw a - ``oneapi::mkl::unimplemented`` exception. +- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` may still + perform some mandatory preprocessing. - oneMKL Interface does not provide a way to use non-default algorithms without calling preprocess functions such as ``cusparseSpMM_preprocess`` or ``cusparseSpMV_preprocess``. Feel free to create an issue if this is needed. 
diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index 08fab76f9..a36ae1c6b 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -91,15 +91,10 @@ inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { void check_valid_spsv(const std::string &function_name, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - dense_vector_handle_t y_handle, spsv_alg alg, bool is_alpha_host_accessible) { + dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { detail::check_valid_spsv_common(function_name, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); check_valid_matrix_properties(function_name, A_handle); - if (alg == spsv_alg::no_optimize_alg) { - throw mkl::unimplemented( - "sparse_blas", function_name, - "The backend does not support the algorithm ``spsv_alg::no_optimize_alg``."); - } } void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, @@ -107,7 +102,7 @@ void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, std::size_t &temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, alg, is_alpha_host_accessible); + check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) { auto cu_handle = sc.get_handle(queue); auto cu_a = A_handle->backend_handle; @@ -133,7 +128,7 @@ inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_ matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr) { - 
check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, alg, + check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); if (!spsv_descr->buffer_size_called) { throw mkl::uninitialized("sparse_blas", "spsv_optimize", @@ -178,6 +173,7 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a } common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); + // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE // Copy the buffer to extend its lifetime until the descriptor is free'd. spsv_descr->workspace.set_buffer_untyped(workspace); @@ -215,6 +211,7 @@ sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const } common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); + // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE auto functor = [=](CusparseScopedContextHandler &sc) { auto cu_handle = sc.get_handle(queue); spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, @@ -229,7 +226,7 @@ sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, const std::vector &dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, alg, is_alpha_host_accessible); + check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { detail::throw_incompatible_container(__func__); } From 96b38fd1cbde08f1128983fd62270f00e2c4b163 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 1 Oct 2024 17:52:00 +0200 Subject: [PATCH 28/43] Fix documentation enums --- docs/domains/sparse_linear_algebra.rst | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index c20651f31..01eccb041 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -128,7 +128,7 @@ spmv - ``CUSPARSE_SPMV_ALG_DEFAULT`` * - ``no_optimize_alg`` - none - - ``CUSPARSE_SPMM_ALG_DEFAULT`` + - ``CUSPARSE_SPMV_ALG_DEFAULT`` * - ``coo_alg1`` - none - ``CUSPARSE_SPMV_COO_ALG1`` @@ -143,7 +143,7 @@ spmv - ``CUSPARSE_SPMV_CSR_ALG2`` * - ``csr_alg3`` - none - - none + - ``CUSPARSE_SPMV_ALG_DEFAULT`` spsv @@ -158,7 +158,7 @@ spsv - cuSPARSE * - ``default_alg`` - none - - ``CUSPARSE_SPMM_ALG_DEFAULT`` + - ``CUSPARSE_SPSV_ALG_DEFAULT`` * - ``no_optimize_alg`` - none - - ``CUSPARSE_SPMM_ALG_DEFAULT`` + - ``CUSPARSE_SPSV_ALG_DEFAULT`` From c5ba2c49f7cedd52b1d5fe931b20953f97b2ef5d Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 13 Sep 2024 15:20:01 +0200 Subject: [PATCH 29/43] Do not retrieve global handle for set_*_data functions using buffer API --- src/sparse_blas/backends/cusparse/cusparse_handles.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp index 09bade903..6aa9709eb 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -73,8 +73,6 @@ void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, s auto event = queue.submit([&](sycl::handler &cgh) { auto acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); if (dvhandle->size != size) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); auto cuda_value_type = CudaEnumType::value; @@ -177,8 +175,6 @@ void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, auto event = queue.submit([&](sycl::handler &cgh) { auto acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - sc.get_handle(queue); if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); @@ -302,8 +298,6 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - sc.get_handle(queue); if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); @@ -428,8 +422,6 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); From 1f5c80c41a5765f32818f022e61bf4d340bd24c7 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 13 Sep 2024 15:20:43 +0200 Subject: [PATCH 30/43] Remove host_task for set_*_data functions using USM API --- .../backends/cusparse/cusparse_handles.cpp | 163 +++++++----------- 1 file changed, 66 insertions(+), 97 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp index 6aa9709eb..1558be92c 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -94,24 +94,17 @@ template void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size, fpType *val) { detail::check_can_reset_value_handle(__func__, dvhandle, false); - auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); - if (dvhandle->size != size) { - CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); - auto cuda_value_type = CudaEnumType::value; - CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, val, - cuda_value_type); - dvhandle->size = size; - } - else { - CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, val); - } - dvhandle->set_usm_ptr(val); - }); - }); - event.wait_and_throw(); + if (dvhandle->size != size) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, val, + cuda_value_type); + dvhandle->size = size; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, val); + } + dvhandle->set_usm_ptr(val); } FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); @@ -202,29 +195,22 @@ void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, fpType *val) { detail::check_can_reset_value_handle(__func__, dmhandle, false); - auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); - if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || - dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { - CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); - auto cuda_value_type = CudaEnumType::value; - auto cuda_order = get_cuda_order(dense_layout); - CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, - num_cols, ld, val, cuda_value_type, cuda_order); - dmhandle->num_rows = num_rows; - dmhandle->num_cols = num_cols; - dmhandle->ld = ld; - dmhandle->dense_layout = dense_layout; - } - else { - CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, val); - } - dmhandle->set_usm_ptr(val); - }); - }); - event.wait_and_throw(); + if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld || + dmhandle->dense_layout != dense_layout) { + CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); + auto cuda_value_type = CudaEnumType::value; + auto cuda_order = get_cuda_order(dense_layout); + CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, num_cols, ld, + val, cuda_value_type, cuda_order); + dmhandle->num_rows = num_rows; + dmhandle->num_cols = num_cols; + dmhandle->ld = ld; + dmhandle->dense_layout = dense_layout; + } + else { + CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, val); + } + dmhandle->set_usm_ptr(val); } FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); @@ -330,34 +316,25 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ind, intType *col_ind, fpType *val) { detail::check_can_reset_sparse_handle(__func__, smhandle, false); - auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); - if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || - smhandle->nnz != nnz || smhandle->index != index) { - CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; - CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, - nnz, row_ind, col_ind, val, cuda_index_type, cuda_index_base, - cuda_value_type); - smhandle->num_rows = num_rows; - smhandle->num_cols = num_cols; - smhandle->nnz = nnz; - smhandle->index = index; - } - else { - CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, row_ind, - col_ind, val); - } - smhandle->row_container.set_usm_ptr(row_ind); - smhandle->col_container.set_usm_ptr(col_ind); - smhandle->value_container.set_usm_ptr(val); - }); - }); - event.wait_and_throw(); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || + smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, nnz, + row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, row_ind, col_ind, val); + } + smhandle->row_container.set_usm_ptr(row_ind); + smhandle->col_container.set_usm_ptr(col_ind); + smhandle->value_container.set_usm_ptr(val); } FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS); @@ -454,34 +431,26 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 
std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind, fpType *val) { detail::check_can_reset_sparse_handle(__func__, smhandle, false); - auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { - // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - sc.get_handle(queue); - if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || - smhandle->nnz != nnz || smhandle->index != index) { - CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; - CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, - nnz, row_ptr, col_ind, val, cuda_index_type, cuda_index_type, - cuda_index_base, cuda_value_type); - smhandle->num_rows = num_rows; - smhandle->num_cols = num_cols; - smhandle->nnz = nnz; - smhandle->index = index; - } - else { - CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, row_ptr, - col_ind, val); - } - smhandle->row_container.set_usm_ptr(row_ptr); - smhandle->col_container.set_usm_ptr(col_ind); - smhandle->value_container.set_usm_ptr(val); - }); - }); - event.wait_and_throw(); + if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || + smhandle->index != index) { + CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); + auto cuda_index_type = CudaIndexEnumType::value; + auto cuda_index_base = get_cuda_index_base(index); + auto cuda_value_type = CudaEnumType::value; + CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, nnz, + row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, + cuda_value_type); + smhandle->num_rows = num_rows; + smhandle->num_cols = num_cols; + smhandle->nnz = nnz; + 
smhandle->index = index; + } + else { + CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, row_ptr, col_ind, val); + } + smhandle->row_container.set_usm_ptr(row_ptr); + smhandle->col_container.set_usm_ptr(col_ind); + smhandle->value_container.set_usm_ptr(val); } FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); From 58f08c938cc1e4fb1c43991040a6a02502654f09 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Tue, 22 Oct 2024 17:45:57 +0200 Subject: [PATCH 31/43] Cache CUstream and cusparseHandle_t in operation descriptor --- .../backends/cusparse/cusparse_handles.cpp | 86 +++++++++---------- .../cusparse/cusparse_scope_handle.hpp | 16 ++-- .../backends/cusparse/cusparse_task.hpp | 12 +-- .../cusparse/operations/cusparse_spmm.cpp | 39 +++++---- .../cusparse/operations/cusparse_spmv.cpp | 44 ++++++---- .../cusparse/operations/cusparse_spsv.cpp | 37 +++++--- 6 files changed, 129 insertions(+), 105 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp index 1558be92c..cd907fb6b 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -37,12 +37,12 @@ void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, st sycl::buffer val) { auto event = queue.submit([&](sycl::handler &cgh) { auto acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_value_type = CudaEnumType::value; cusparseDnVecDescr_t cu_dvhandle; - CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, sc.get_mem(acc), + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, get_mem(ih, acc), cuda_value_type); *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); }); @@ -54,9 +54,9 @@ template void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size, fpType *val) { auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_value_type = CudaEnumType::value; cusparseDnVecDescr_t cu_dvhandle; CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, val, cuda_value_type); @@ -72,17 +72,17 @@ void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, s detail::check_can_reset_value_handle(__func__, dvhandle, true); auto event = queue.submit([&](sycl::handler &cgh) { auto acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (dvhandle->size != size) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); auto cuda_value_type = CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, - sc.get_mem(acc), cuda_value_type); + get_mem(ih, acc), cuda_value_type); dvhandle->size = size; } else { CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, - sc.get_mem(acc)); + get_mem(ih, acc)); } dvhandle->set_buffer(val); }); @@ -91,7 +91,7 @@ void set_dense_vector_data(sycl::queue &queue, 
dense_vector_handle_t dvhandle, s } template -void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size, +void set_dense_vector_data(sycl::queue &, dense_vector_handle_t dvhandle, std::int64_t size, fpType *val) { detail::check_can_reset_value_handle(__func__, dvhandle, false); if (dvhandle->size != size) { @@ -112,7 +112,7 @@ FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, const std::vector &dependencies) { // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used - auto functor = [=](CusparseScopedContextHandler &) { + auto functor = [=](sycl::interop_handle) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); delete dvhandle; }; @@ -126,14 +126,14 @@ void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, st sycl::buffer val) { auto event = queue.submit([&](sycl::handler &cgh) { auto acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_value_type = CudaEnumType::value; auto cuda_order = get_cuda_order(dense_layout); cusparseDnMatDescr_t cu_dmhandle; CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, - sc.get_mem(acc), cuda_value_type, cuda_order); + get_mem(ih, acc), cuda_value_type, cuda_order); *p_dmhandle = new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout); }); @@ -145,9 +145,9 @@ template void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType *val) { auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_value_type = CudaEnumType::value; auto cuda_order = get_cuda_order(dense_layout); cusparseDnMatDescr_t cu_dmhandle; @@ -167,14 +167,14 @@ void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, detail::check_can_reset_value_handle(__func__, dmhandle, true); auto event = queue.submit([&](sycl::handler &cgh) { auto acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); auto cuda_value_type = CudaEnumType::value; auto cuda_order = get_cuda_order(dense_layout); CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, - num_cols, ld, sc.get_mem(acc), cuda_value_type, 
cuda_order); + num_cols, ld, get_mem(ih, acc), cuda_value_type, cuda_order); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; dmhandle->ld = ld; @@ -182,7 +182,7 @@ void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, } else { CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, - sc.get_mem(acc)); + get_mem(ih, acc)); } dmhandle->set_buffer(val); }); @@ -191,9 +191,9 @@ void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, } template -void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, - std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, - oneapi::mkl::layout dense_layout, fpType *val) { +void set_dense_matrix_data(sycl::queue &, dense_matrix_handle_t dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, + fpType *val) { detail::check_can_reset_value_handle(__func__, dmhandle, false); if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { @@ -218,7 +218,7 @@ FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, const std::vector &dependencies) { // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used - auto functor = [=](CusparseScopedContextHandler &) { + auto functor = [=](sycl::interop_handle) { CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); delete dmhandle; }; @@ -235,15 +235,15 @@ void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 auto row_acc = row_ind.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse 
handle is created before any other cuSPARSE function is called. - sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_index_type = CudaIndexEnumType::value; auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, - sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc), + get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc), cuda_index_type, cuda_index_base, cuda_value_type); *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, num_rows, num_cols, nnz, index); @@ -257,9 +257,9 @@ void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ind, intType *col_ind, fpType *val) { auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_index_type = CudaIndexEnumType::value; auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; @@ -283,7 +283,7 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto row_acc = row_ind.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); @@ -291,8 +291,8 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, - nnz, sc.get_mem(row_acc), sc.get_mem(col_acc), - sc.get_mem(val_acc), cuda_index_type, cuda_index_base, + nnz, get_mem(ih, row_acc), get_mem(ih, col_acc), + get_mem(ih, val_acc), cuda_index_type, cuda_index_base, cuda_value_type); smhandle->num_rows = num_rows; smhandle->num_cols = num_cols; @@ -301,7 +301,7 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 } else { CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, - sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc)); + get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc)); } smhandle->row_container.set_buffer(row_ind); smhandle->col_container.set_buffer(col_ind); @@ -312,7 +312,7 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 } template -void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, +void 
set_coo_matrix_data(sycl::queue &, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ind, intType *col_ind, fpType *val) { detail::check_can_reset_sparse_handle(__func__, smhandle, false); @@ -349,15 +349,15 @@ void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 auto row_acc = row_ptr.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_index_type = CudaIndexEnumType::value; auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, - sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc), + get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc), cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, num_rows, num_cols, nnz, index); @@ -371,9 +371,9 @@ void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind, fpType *val) { auto event = queue.submit([&](sycl::handler &cgh) { - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- sc.get_handle(queue); + CusparseScopedContextHandler(queue, ih).get_handle(queue); auto cuda_index_type = CudaIndexEnumType::value; auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; @@ -398,7 +398,7 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto row_acc = row_ptr.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](CusparseScopedContextHandler &sc) { + submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); @@ -406,8 +406,8 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 auto cuda_index_base = get_cuda_index_base(index); auto cuda_value_type = CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, - nnz, sc.get_mem(row_acc), sc.get_mem(col_acc), - sc.get_mem(val_acc), cuda_index_type, cuda_index_type, + nnz, get_mem(ih, row_acc), get_mem(ih, col_acc), + get_mem(ih, val_acc), cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); smhandle->num_rows = num_rows; smhandle->num_cols = num_cols; @@ -416,7 +416,7 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 } else { CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, - sc.get_mem(row_acc), sc.get_mem(col_acc), sc.get_mem(val_acc)); + get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc)); } smhandle->row_container.set_buffer(row_ptr); smhandle->col_container.set_buffer(col_ind); @@ -427,7 +427,7 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 } template -void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t 
num_rows, +void set_csr_matrix_data(sycl::queue &, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind, fpType *val) { detail::check_can_reset_sparse_handle(__func__, smhandle, false); @@ -458,7 +458,7 @@ FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle, const std::vector &dependencies) { // Use dispatch_submit to ensure the backend's handle is kept alive as long as the buffers are used - auto functor = [=](CusparseScopedContextHandler &) { + auto functor = [=](sycl::interop_handle) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); delete smhandle; }; diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp index 0ad3c401a..d872cbab3 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp @@ -73,16 +73,16 @@ class CusparseScopedContextHandler { /// See get_handle_and_stream cusparseHandle_t get_handle(const sycl::queue &queue); - - // Get the native pointer from an accessor. This is a different pointer than - // what can be retrieved with get_multi_ptr. - template - inline void *get_mem(AccT acc) { - auto cudaPtr = ih.get_native_mem(acc); - return reinterpret_cast(cudaPtr); - } }; +// Get the native pointer from an accessor. This is a different pointer than +// what can be retrieved with get_multi_ptr. 
+template +inline void *get_mem(sycl::interop_handle ih, AccT acc) { + auto cudaPtr = ih.get_native_mem(acc); + return reinterpret_cast(cudaPtr); +} + } // namespace oneapi::mkl::sparse::cusparse #endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp index c6f34d49a..4c187c3db 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_task.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp @@ -73,8 +73,7 @@ void submit_host_task(sycl::handler &cgh, sycl::queue &queue, Functor functor, cgh.host_task([functor, queue, capture_only_accessors...](sycl::interop_handle ih) { auto unused = std::make_tuple(capture_only_accessors...); (void)unused; - auto sc = CusparseScopedContextHandler(queue, ih); - functor(sc); + functor(ih); }); } @@ -93,8 +92,7 @@ void submit_host_task_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor f capture_only_accessors...](sycl::interop_handle ih) { auto unused = std::make_tuple(capture_only_accessors...); (void)unused; - auto sc = CusparseScopedContextHandler(queue, ih); - functor(sc, workspace_placeholder_acc); + functor(ih, workspace_placeholder_acc); }); } @@ -114,7 +112,6 @@ void submit_native_command_ext(sycl::handler &cgh, sycl::queue &queue, Functor f [functor, queue, dependencies, capture_only_accessors...](sycl::interop_handle ih) { auto unused = std::make_tuple(capture_only_accessors...); (void)unused; - auto sc = CusparseScopedContextHandler(queue, ih); // The functor using ext_codeplay_enqueue_native_command need to // explicitly wait on the events for the SPARSE domain. 
The // extension ext_codeplay_enqueue_native_command is used to launch @@ -129,7 +126,7 @@ void submit_native_command_ext(sycl::handler &cgh, sycl::queue &queue, Functor f for (auto event : dependencies) { event.wait(); } - functor(sc); + functor(ih); }); #else (void)dependencies; @@ -155,7 +152,6 @@ void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, capture_only_accessors...](sycl::interop_handle ih) { auto unused = std::make_tuple(capture_only_accessors...); (void)unused; - auto sc = CusparseScopedContextHandler(queue, ih); // The functor using ext_codeplay_enqueue_native_command need to // explicitly wait on the events for the SPARSE domain. The // extension ext_codeplay_enqueue_native_command is used to launch @@ -170,7 +166,7 @@ void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, for (auto event : dependencies) { event.wait(); } - functor(sc, workspace_placeholder_acc); + functor(ih, workspace_placeholder_acc); }); #else (void)dependencies; diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index a179ae2f1..1b1933693 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -32,6 +32,12 @@ namespace oneapi::mkl::sparse { // Complete the definition of the incomplete type struct spmm_descr { + // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them. + // cuSPARSE seem to implicitly require to use the same CUstream for a whole operation (buffer_size, optimization and computation steps). + // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command. 
+ CUstream cu_stream; + cusparseHandle_t cu_handle; + detail::generic_container workspace; std::size_t temp_buffer_size = 0; bool buffer_size_called = false; @@ -99,8 +105,11 @@ void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mk bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, is_alpha_host_accessible, is_beta_host_accessible, alg); - auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler& sc) { - auto cu_handle = sc.get_handle(queue); + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spmm_descr->cu_handle = cu_handle; + spmm_descr->cu_stream = cu_stream; auto cu_a = A_handle->backend_handle; auto cu_b = B_handle->backend_handle; auto cu_c = C_handle->backend_handle; @@ -177,10 +186,9 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: // cusparseSpMM_preprocess cannot be called if the workspace is empty return; } - auto functor = [=](CusparseScopedContextHandler& sc, - sycl::accessor workspace_acc) { - auto cu_handle = sc.get_handle(queue); - auto workspace_ptr = sc.get_mem(workspace_acc); + auto functor = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { + auto cu_handle = spmm_descr->cu_handle; + auto workspace_ptr = get_mem(ih, workspace_acc); spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg, workspace_ptr, is_alpha_host_accessible); }; @@ -206,8 +214,8 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, // cusparseSpMM_preprocess cannot be called if the workspace is empty return detail::collapse_dependencies(queue, dependencies); } - auto functor = [=](CusparseScopedContextHandler& sc) { - auto cu_handle = sc.get_handle(queue); + auto functor = [=](sycl::interop_handle) { + auto 
cu_handle = spmm_descr->cu_handle; spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg, workspace, is_alpha_host_accessible); }; @@ -240,8 +248,8 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr CHECK_DESCR_MATCH(spmm_descr, C_handle, "spmm_optimize"); CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize"); - auto compute_functor = [=](CusparseScopedContextHandler& sc, void* workspace_ptr) { - auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + auto compute_functor = [=](void* workspace_ptr) { + auto cu_handle = spmm_descr->cu_handle; auto cu_a = A_handle->backend_handle; auto cu_b = B_handle->backend_handle; auto cu_c = C_handle->backend_handle; @@ -255,15 +263,16 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr cu_type, cu_alg, workspace_ptr); check_status(status, __func__); #ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + auto cu_stream = spmm_descr->cu_stream; CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); #endif }; if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) { // The accessor can only be created if the buffer size is greater than 0 - auto functor_buffer = [=](CusparseScopedContextHandler& sc, + auto functor_buffer = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { - auto workspace_ptr = sc.get_mem(workspace_acc); - compute_functor(sc, workspace_ptr); + auto workspace_ptr = get_mem(ih, workspace_acc); + compute_functor(workspace_ptr); }; return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, spmm_descr->workspace.get_buffer(), @@ -274,8 +283,8 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr // workspace accessor is needed, workspace_ptr will be a nullptr in the // latter case. 
auto workspace_ptr = spmm_descr->workspace.usm_ptr; - auto functor_usm = [=](CusparseScopedContextHandler& sc) { - compute_functor(sc, workspace_ptr); + auto functor_usm = [=](sycl::interop_handle) { + compute_functor(workspace_ptr); }; return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle, B_handle, C_handle); diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index 54621ce7d..a804574eb 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -32,6 +32,12 @@ namespace oneapi::mkl::sparse { // Complete the definition of the incomplete type struct spmv_descr { + // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them. + // cuSPARSE seem to implicitly require to use the same CUstream for a whole operation (buffer_size, optimization and computation steps). + // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command. 
+ CUstream cu_stream; + cusparseHandle_t cu_handle; + detail::generic_container workspace; std::size_t temp_buffer_size = 0; bool buffer_size_called = false; @@ -89,8 +95,12 @@ void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, is_beta_host_accessible); - auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) { - auto cu_handle = sc.get_handle(queue); + + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spmv_descr->cu_handle = cu_handle; + spmv_descr->cu_stream = cu_stream; auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; @@ -171,10 +181,9 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a return; #else if (spmv_descr->temp_buffer_size > 0) { - auto functor = [=](CusparseScopedContextHandler &sc, - sycl::accessor workspace_acc) { - auto cu_handle = sc.get_handle(queue); - auto workspace_ptr = sc.get_mem(workspace_acc); + auto functor = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { + auto cu_handle = spmv_descr->cu_handle; + auto workspace_ptr = get_mem(ih, workspace_acc); spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, workspace_ptr, is_alpha_host_accessible); }; @@ -183,8 +192,8 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { - auto functor = [=](CusparseScopedContextHandler &sc) { - auto cu_handle = sc.get_handle(queue); + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spmv_descr->cu_handle; spmv_optimize_impl(cu_handle, opA, alpha, 
A_handle, x_handle, beta, y_handle, alg, nullptr, is_alpha_host_accessible); }; @@ -214,8 +223,8 @@ sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) return detail::collapse_dependencies(queue, dependencies); #else - auto functor = [=](CusparseScopedContextHandler &sc) { - auto cu_handle = sc.get_handle(queue); + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spmv_descr->cu_handle; spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, workspace, is_alpha_host_accessible); }; @@ -246,8 +255,8 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp CHECK_DESCR_MATCH(spmv_descr, y_handle, "spmv_optimize"); CHECK_DESCR_MATCH(spmv_descr, alg, "spmv_optimize"); - auto compute_functor = [=](CusparseScopedContextHandler &sc, void *workspace_ptr) { - auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + auto compute_functor = [=](void *workspace_ptr) { + auto cu_handle = spmv_descr->cu_handle; auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; @@ -260,15 +269,16 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp workspace_ptr); check_status(status, __func__); #ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + auto cu_stream = spmv_descr->cu_stream; CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); #endif }; if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) { // The accessor can only be created if the buffer size is greater than 0 - auto functor_buffer = [=](CusparseScopedContextHandler &sc, + auto functor_buffer = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { - auto workspace_ptr = sc.get_mem(workspace_acc); - compute_functor(sc, workspace_ptr); + auto workspace_ptr = get_mem(ih, workspace_acc); + compute_functor(workspace_ptr); }; return 
dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, spmv_descr->workspace.get_buffer(), @@ -279,8 +289,8 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp // workspace accessor is needed, workspace_ptr will be a nullptr in the // latter case. auto workspace_ptr = spmv_descr->workspace.usm_ptr; - auto functor_usm = [=](CusparseScopedContextHandler &sc) { - compute_functor(sc, workspace_ptr); + auto functor_usm = [=](sycl::interop_handle) { + compute_functor(workspace_ptr); }; return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle, x_handle, y_handle); diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index a36ae1c6b..73c435bc8 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -32,6 +32,12 @@ namespace oneapi::mkl::sparse { // Complete the definition of the incomplete type struct spsv_descr { + // Cache the CUstream and global handle to avoid relying on CusparseScopedContextHandler to retrieve them. + // cuSPARSE seem to implicitly require to use the same CUstream for a whole operation (buffer_size, optimization and computation steps). + // This is needed as the default SYCL queue is out-of-order which can have a different CUstream for each host_task or native_command. 
+ CUstream cu_stream; + cusparseHandle_t cu_handle; + cusparseSpSVDescr_t cu_descr; detail::generic_container workspace; bool buffer_size_called = false; @@ -69,7 +75,7 @@ sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, if (spsv_descr->last_optimized_A_handle && spsv_descr->last_optimized_A_handle->all_use_buffer() && spsv_descr->last_optimized_x_handle && spsv_descr->last_optimized_y_handle) { - auto dispatch_functor = [=](CusparseScopedContextHandler &) { + auto dispatch_functor = [=](sycl::interop_handle) { release_functor(); }; return dispatch_submit( @@ -77,7 +83,7 @@ sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, spsv_descr->last_optimized_x_handle, spsv_descr->last_optimized_y_handle); } - // Release used if USM is used or the descriptor has been released before spsv_optimize has succeeded + // Release used if USM is used or if the descriptor has been released before spsv_optimize has succeeded sycl::event event = queue.submit([&](sycl::handler &cgh) { cgh.depends_on(dependencies); cgh.host_task(release_functor); @@ -103,8 +109,11 @@ void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void std::size_t &temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); - auto functor = [=, &temp_buffer_size](CusparseScopedContextHandler &sc) { - auto cu_handle = sc.get_handle(queue); + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spsv_descr->cu_handle = cu_handle; + spsv_descr->cu_stream = cu_stream; auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; @@ -178,10 +187,9 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a 
spsv_descr->workspace.set_buffer_untyped(workspace); if (workspace.size() > 0) { - auto functor = [=](CusparseScopedContextHandler &sc, - sycl::accessor workspace_acc) { - auto cu_handle = sc.get_handle(queue); - auto workspace_ptr = sc.get_mem(workspace_acc); + auto functor = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { + auto cu_handle = spsv_descr->cu_handle; + auto workspace_ptr = get_mem(ih, workspace_acc); spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, workspace_ptr, is_alpha_host_accessible); }; @@ -190,8 +198,8 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { - auto functor = [=](CusparseScopedContextHandler &sc) { - auto cu_handle = sc.get_handle(queue); + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spsv_descr->cu_handle; spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, nullptr, is_alpha_host_accessible); }; @@ -212,8 +220,8 @@ sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, spsv_descr); // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE - auto functor = [=](CusparseScopedContextHandler &sc) { - auto cu_handle = sc.get_handle(queue); + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spsv_descr->cu_handle; spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, workspace, is_alpha_host_accessible); }; @@ -242,8 +250,8 @@ sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp CHECK_DESCR_MATCH(spsv_descr, y_handle, "spsv_optimize"); CHECK_DESCR_MATCH(spsv_descr, alg, "spsv_optimize"); - auto functor = [=](CusparseScopedContextHandler &sc) { - auto [cu_handle, cu_stream] = 
sc.get_handle_and_stream(queue); + auto functor = [=](sycl::interop_handle) { + auto cu_handle = spsv_descr->cu_handle; auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; @@ -258,6 +266,7 @@ sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp cu_descr); check_status(status, __func__); #ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + auto cu_stream = spsv_descr->cu_stream; CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); #endif }; From c0eae1e46d916e78964a2c7781c06b2176abf1c4 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 24 Oct 2024 11:12:45 +0200 Subject: [PATCH 32/43] Ensure descriptor is kept alive long enough when buffers are used --- .../cusparse/operations/cusparse_spmm.cpp | 34 ++++++++++++++++++- .../cusparse/operations/cusparse_spmv.cpp | 34 ++++++++++++++++++- .../cusparse/operations/cusparse_spsv.cpp | 19 +++++++---- src/sparse_blas/generic_container.hpp | 10 +++--- 4 files changed, 85 insertions(+), 12 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index 1b1933693..d73c801b9 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -61,7 +61,39 @@ void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, const std::vector& dependencies) { - return detail::submit_release(queue, spmm_descr, dependencies); + if (!spmm_descr) { + return detail::collapse_dependencies(queue, dependencies); + } + + auto release_functor = [=]() { + spmm_descr->cu_handle = nullptr; + spmm_descr->last_optimized_A_handle = nullptr; + spmm_descr->last_optimized_B_handle = nullptr; + spmm_descr->last_optimized_C_handle = nullptr; + delete spmm_descr; + }; + + // Use dispatch_submit to ensure the 
descriptor is kept alive as long as the buffers are used + // dispatch_submit can only be used if the descriptor's handles are valid + if (spmm_descr->last_optimized_A_handle && + spmm_descr->last_optimized_A_handle->all_use_buffer() && + spmm_descr->last_optimized_B_handle && spmm_descr->last_optimized_C_handle && + spmm_descr->workspace.use_buffer()) { + auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { + release_functor(); + }; + return dispatch_submit( + __func__, queue, dispatch_functor, spmm_descr->last_optimized_A_handle, + spmm_descr->workspace.get_buffer(), spmm_descr->last_optimized_B_handle, + spmm_descr->last_optimized_C_handle); + } + + // Release used if USM is used or if the descriptor has been released before spmm_optimize has succeeded + sycl::event event = queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; } inline auto get_cuda_spmm_alg(spmm_alg alg) { diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index a804574eb..0510ba20f 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -60,7 +60,39 @@ void init_spmv_descr(sycl::queue & /*queue*/, spmv_descr_t *p_spmv_descr) { sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, const std::vector &dependencies) { - return detail::submit_release(queue, spmv_descr, dependencies); + if (!spmv_descr) { + return detail::collapse_dependencies(queue, dependencies); + } + + auto release_functor = [=]() { + spmv_descr->cu_handle = nullptr; + spmv_descr->last_optimized_A_handle = nullptr; + spmv_descr->last_optimized_x_handle = nullptr; + spmv_descr->last_optimized_y_handle = nullptr; + delete spmv_descr; + }; + + // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used + // 
dispatch_submit can only be used if the descriptor's handles are valid + if (spmv_descr->last_optimized_A_handle && + spmv_descr->last_optimized_A_handle->all_use_buffer() && + spmv_descr->last_optimized_x_handle && spmv_descr->last_optimized_y_handle && + spmv_descr->workspace.use_buffer()) { + auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { + release_functor(); + }; + return dispatch_submit( + __func__, queue, dispatch_functor, spmv_descr->last_optimized_A_handle, + spmv_descr->workspace.get_buffer(), spmv_descr->last_optimized_x_handle, + spmv_descr->last_optimized_y_handle); + } + + // Release used if USM is used or if the descriptor has been released before spmv_optimize has succeeded + sycl::event event = queue.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; } inline auto get_cuda_spmv_alg(spmv_alg alg) { diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index 73c435bc8..fc3d46ad8 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -62,25 +62,32 @@ void init_spsv_descr(sycl::queue & /*queue*/, spsv_descr_t *p_spsv_descr) { sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, const std::vector &dependencies) { if (!spsv_descr) { - return {}; + return detail::collapse_dependencies(queue, dependencies); } auto release_functor = [=]() { CUSPARSE_ERR_FUNC(cusparseSpSV_destroyDescr, spsv_descr->cu_descr); + spsv_descr->cu_handle = nullptr; + spsv_descr->cu_descr = nullptr; + spsv_descr->last_optimized_A_handle = nullptr; + spsv_descr->last_optimized_x_handle = nullptr; + spsv_descr->last_optimized_y_handle = nullptr; delete spsv_descr; }; - // Use dispatch_submit to ensure the backend's descriptor is kept alive as long as the buffers are used + // Use dispatch_submit 
to ensure the descriptor is kept alive as long as the buffers are used // dispatch_submit can only be used if the descriptor's handles are valid if (spsv_descr->last_optimized_A_handle && spsv_descr->last_optimized_A_handle->all_use_buffer() && - spsv_descr->last_optimized_x_handle && spsv_descr->last_optimized_y_handle) { - auto dispatch_functor = [=](sycl::interop_handle) { + spsv_descr->last_optimized_x_handle && spsv_descr->last_optimized_y_handle && + spsv_descr->workspace.use_buffer()) { + auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { release_functor(); }; return dispatch_submit( - __func__, queue, dependencies, dispatch_functor, spsv_descr->last_optimized_A_handle, - spsv_descr->last_optimized_x_handle, spsv_descr->last_optimized_y_handle); + __func__, queue, dispatch_functor, spsv_descr->last_optimized_A_handle, + spsv_descr->workspace.get_buffer(), spsv_descr->last_optimized_x_handle, + spsv_descr->last_optimized_y_handle); } // Release used if USM is used or if the descriptor has been released before spsv_optimize has succeeded diff --git a/src/sparse_blas/generic_container.hpp b/src/sparse_blas/generic_container.hpp index 5fa278497..c2e8476a7 100644 --- a/src/sparse_blas/generic_container.hpp +++ b/src/sparse_blas/generic_container.hpp @@ -39,10 +39,12 @@ struct generic_container { // USM pointer, nullptr if the provided data is a buffer. void* usm_ptr; - // Buffer pointer, nullptr if the provided data is a USM pointer. - // The buffer is needed to properly handle the dependencies when the handle is used. - // Use a void* type for the buffer to avoid using template arguments in every function using data handles. - // Using reinterpret does not solve the issue as the returned buffer needs the type of the original buffer for the aligned_allocator. + // Buffer pointer, nullptr if the provided data is a USM pointer. The buffer + // is needed to properly handle the dependencies when the handle is used. 
+ // Use a void* type for the buffer to avoid using template arguments in + // every function using data handles. Using `sycl::buffer::reinterpret` does + // not solve the issue as the returned buffer needs the type of the original + // buffer for the aligned_allocator. std::shared_ptr buffer_ptr; // Underlying USM or buffer data type From 2149e396a3aa5dae45c568c1e387ae3e0f06ba95 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 24 Oct 2024 12:08:11 +0200 Subject: [PATCH 33/43] USM tests using reset_data wait before updating device values --- tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp | 3 +-- tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp | 3 +-- tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp index 8070633fc..68528f757 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp @@ -152,9 +152,8 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, static_cast(nrows_A)); + ev_spmm.wait_and_throw(); if (reset_nnz > nnz) { - // Wait before freeing usm pointers - ev_spmm.wait_and_throw(); ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); ja_usm_uptr = malloc_device_uptr(main_queue, ja_host.size()); a_usm_uptr = malloc_device_uptr(main_queue, a_host.size()); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp index 2852a2495..e6df1840d 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp @@ -144,9 +144,8 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t 
format, intType nrows_A, shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, static_cast(nrows_A)); + ev_spmv.wait_and_throw(); if (reset_nnz > nnz) { - // Wait before freeing usm pointers - ev_spmv.wait_and_throw(); ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); ja_usm_uptr = malloc_device_uptr(main_queue, ja_host.size()); a_usm_uptr = malloc_device_uptr(main_queue, a_host.size()); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index 03edf8d37..6a3f485a9 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -141,9 +141,8 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl require_diagonal); shuffle_sparse_matrix_if_needed(format, matrix_properties, indexing, ia_host.data(), ja_host.data(), a_host.data(), reset_nnz, mu); + ev_spsv.wait_and_throw(); if (reset_nnz > nnz) { - // Wait before freeing usm pointers - ev_spsv.wait_and_throw(); ia_usm_uptr = malloc_device_uptr(main_queue, ia_host.size()); ja_usm_uptr = malloc_device_uptr(main_queue, ja_host.size()); a_usm_uptr = malloc_device_uptr(main_queue, a_host.size()); From bad6bfb0f00467f2a34caa403200af6018d3706a Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 29 Jul 2024 17:11:35 +0200 Subject: [PATCH 34/43] Force inputs to be copied on device before the optimize step --- .../sparse_blas/source/sparse_spmm_usm.cpp | 43 ++++++++----------- .../sparse_blas/source/sparse_spmv_usm.cpp | 42 ++++++++---------- .../sparse_blas/source/sparse_spsv_usm.cpp | 40 ++++++++--------- 3 files changed, 53 insertions(+), 72 deletions(-) diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp index 68528f757..f74403d6c 100644 --- 
a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp @@ -85,26 +85,21 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, fpType *b_usm = b_usm_uptr.get(); fpType *c_usm = c_usm_uptr.get(); - std::vector mat_dependencies; - std::vector spmm_dependencies; + std::vector dependencies; // Copy host to device - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType))); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType))); - mat_dependencies.push_back( - main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); - spmm_dependencies.push_back( - main_queue.memcpy(b_usm, b_host.data(), b_host.size() * sizeof(fpType))); - spmm_dependencies.push_back( - main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(b_usm, b_host.data(), b_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType))); fpType *alpha_host_or_usm_ptr = &alpha; fpType *beta_host_or_usm_ptr = &beta; if (test_scalar_on_device) { - spmm_dependencies.push_back( - main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); - spmm_dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); alpha_host_or_usm_ptr = alpha_usm_uptr.get(); beta_host_or_usm_ptr = beta_usm_uptr.get(); } @@ -138,12 +133,10 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, sycl::event ev_opt; CALL_RT_OR_CT(ev_opt =
oneapi::mkl::sparse::spmm_optimize, main_queue, transpose_A, transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, - workspace_usm.get(), mat_dependencies); + workspace_usm.get(), dependencies); - spmm_dependencies.push_back(ev_opt); CALL_RT_OR_CT(ev_spmm = oneapi::mkl::sparse::spmm, main_queue, transpose_A, transpose_B, - &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, - spmm_dependencies); + &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, { ev_opt }); if (reset_data) { intType reset_nnz = generate_random_matrix( @@ -163,14 +156,14 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, } nnz = reset_nnz; - mat_dependencies.clear(); - mat_dependencies.push_back(main_queue.memcpy( - ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spmm)); - mat_dependencies.push_back(main_queue.memcpy( - ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spmm)); - mat_dependencies.push_back( + dependencies.clear(); + dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(), + ia_host.size() * sizeof(intType), ev_spmm)); + dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(), + ja_host.size() * sizeof(intType), ev_spmm)); + dependencies.push_back( main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spmm)); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType), ev_spmm)); set_matrix_data(main_queue, format, A_handle, nrows_A, ncols_A, nnz, index, ia_usm, ja_usm, a_usm); @@ -185,7 +178,7 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmm_optimize, main_queue, transpose_A, transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, - descr, workspace_usm.get(), mat_dependencies); + descr, workspace_usm.get(), dependencies); CALL_RT_OR_CT(ev_spmm = oneapi::mkl::sparse::spmm, 
main_queue, transpose_A, transpose_B, &alpha, A_view, A_handle, B_handle, &beta, C_handle, alg, descr, diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp index e6df1840d..7f083003b 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp @@ -78,26 +78,21 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, fpType *x_usm = x_usm_uptr.get(); fpType *y_usm = y_usm_uptr.get(); - std::vector mat_dependencies; - std::vector spmv_dependencies; + std::vector dependencies; // Copy host to device - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType))); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType))); - mat_dependencies.push_back( - main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); - spmv_dependencies.push_back( - main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); - spmv_dependencies.push_back( - main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); fpType *alpha_host_or_usm_ptr = &alpha; fpType *beta_host_or_usm_ptr = &beta; if (test_scalar_on_device) { - spmv_dependencies.push_back( - main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); - spmv_dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); +
dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); alpha_host_or_usm_ptr = alpha_usm_uptr.get(); beta_host_or_usm_ptr = beta_usm_uptr.get(); } @@ -130,12 +125,11 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, sycl::event ev_opt; CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, - y_handle, alg, descr, workspace_usm.get(), mat_dependencies); + y_handle, alg, descr, workspace_usm.get(), dependencies); - spmv_dependencies.push_back(ev_opt); CALL_RT_OR_CT(ev_spmv = oneapi::mkl::sparse::spmv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, - y_handle, alg, descr, spmv_dependencies); + y_handle, alg, descr, { ev_opt }); if (reset_data) { intType reset_nnz = generate_random_matrix( @@ -155,14 +149,14 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, } nnz = reset_nnz; - mat_dependencies.clear(); - mat_dependencies.push_back(main_queue.memcpy( - ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spmv)); - mat_dependencies.push_back(main_queue.memcpy( - ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spmv)); - mat_dependencies.push_back( + dependencies.clear(); + dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(), + ia_host.size() * sizeof(intType), ev_spmv)); + dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(), + ja_host.size() * sizeof(intType), ev_spmv)); + dependencies.push_back( main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spmv)); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType), ev_spmv)); set_matrix_data(main_queue, format, A_handle, nrows_A, ncols_A, nnz, index, ia_usm, ja_usm, a_usm); @@ -177,7 +171,7 @@ int test_spmv(sycl::device *dev, 
sparse_matrix_format_t format, intType nrows_A, CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spmv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, - y_handle, alg, descr, workspace_usm.get(), mat_dependencies); + y_handle, alg, descr, workspace_usm.get(), dependencies); CALL_RT_OR_CT(ev_spmv = oneapi::mkl::sparse::spmv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, beta_host_or_usm_ptr, diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index 6a3f485a9..755bdd155 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -82,24 +82,19 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl fpType *x_usm = x_usm_uptr.get(); fpType *y_usm = y_usm_uptr.get(); - std::vector mat_dependencies; - std::vector spsv_dependencies; + std::vector dependencies; // Copy host to device - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ia_usm, ia_host.data(), ia_host.size() * sizeof(intType))); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(ja_usm, ja_host.data(), ja_host.size() * sizeof(intType))); - mat_dependencies.push_back( - main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); - spsv_dependencies.push_back( - main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); - spsv_dependencies.push_back( - main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); fpType *alpha_host_or_usm_ptr = &alpha; if (test_scalar_on_device) {
- spsv_dependencies.push_back( - main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); + dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); alpha_host_or_usm_ptr = alpha_usm_uptr.get(); } @@ -128,12 +123,11 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl sycl::event ev_opt; CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spsv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr, - workspace_usm.get(), mat_dependencies); + workspace_usm.get(), dependencies); - spsv_dependencies.push_back(ev_opt); CALL_RT_OR_CT(ev_spsv = oneapi::mkl::sparse::spsv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr, - spsv_dependencies); + { ev_opt }); if (reset_data) { intType reset_nnz = generate_random_matrix( @@ -152,14 +146,14 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl } nnz = reset_nnz; - mat_dependencies.clear(); - mat_dependencies.push_back(main_queue.memcpy( - ia_usm, ia_host.data(), ia_host.size() * sizeof(intType), ev_spsv)); - mat_dependencies.push_back(main_queue.memcpy( - ja_usm, ja_host.data(), ja_host.size() * sizeof(intType), ev_spsv)); - mat_dependencies.push_back( + dependencies.clear(); + dependencies.push_back(main_queue.memcpy(ia_usm, ia_host.data(), + ia_host.size() * sizeof(intType), ev_spsv)); + dependencies.push_back(main_queue.memcpy(ja_usm, ja_host.data(), + ja_host.size() * sizeof(intType), ev_spsv)); + dependencies.push_back( main_queue.memcpy(a_usm, a_host.data(), a_host.size() * sizeof(fpType), ev_spsv)); - mat_dependencies.push_back( + dependencies.push_back( main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType), ev_spsv)); set_matrix_data(main_queue, format, A_handle, m, m, nnz, index, ia_usm, ja_usm, a_usm); @@ -173,7 +167,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl 
CALL_RT_OR_CT(ev_opt = oneapi::mkl::sparse::spsv_optimize, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr, - workspace_usm.get(), mat_dependencies); + workspace_usm.get(), dependencies); CALL_RT_OR_CT(ev_spsv = oneapi::mkl::sparse::spsv, main_queue, transpose_val, alpha_host_or_usm_ptr, A_view, A_handle, x_handle, y_handle, alg, descr, From 6318d53140772c6052269df68e2980833f02d748 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 24 Oct 2024 16:27:04 +0200 Subject: [PATCH 35/43] Disable specific case of spmm with csr_alg3 failing --- docs/domains/sparse_linear_algebra.rst | 5 +++++ .../backends/cusparse/operations/cusparse_spmm.cpp | 7 +++++++ 2 files changed, 12 insertions(+) diff --git a/docs/domains/sparse_linear_algebra.rst b/docs/domains/sparse_linear_algebra.rst index 01eccb041..07d90359a 100644 --- a/docs/domains/sparse_linear_algebra.rst +++ b/docs/domains/sparse_linear_algebra.rst @@ -54,6 +54,11 @@ Currently known limitations: - Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3`` and an ``opA`` other than ``transpose::nontrans`` or an ``opB`` ``transpose::conjtrans`` will throw a ``oneapi::mkl::unimplemented`` exception. +- Using ``spmm`` with the algorithm ``spmm_alg::csr_alg3``, + ``opB=transpose::trans`` and real fp64 precision will throw a + ``oneapi::mkl::unimplemented`` exception. This configuration can fail as of + CUDA 12.6.2, see the related issue + `here <https://forums.developer.nvidia.com/t/cusparse-spmm-sample-failing-with-misaligned-address/311022>`_. - Using ``spmv`` with a ``type_view`` other than ``matrix_descr::general`` will throw a ``oneapi::mkl::unimplemented`` exception.
- Using ``spsv`` with the algorithm ``spsv_alg::no_optimize_alg`` may still diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index d73c801b9..0af9fb173 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -126,6 +126,13 @@ void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose o "sparse_blas", function_name, "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::conjtrans`."); } + if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::trans && + A_handle->get_value_type() == detail::data_type::real_fp64) { + // TODO: Remove once the issue is fixed: https://forums.developer.nvidia.com/t/cusparse-spmm-sample-failing-with-misaligned-address/311022 + throw mkl::unimplemented( + "sparse_blas", function_name, + "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::trans` and the real fp64 precision is used."); + } } void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, From 956ae496aa68275d0f7df1e7c94f7c7960d16ce6 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 24 Oct 2024 17:59:01 +0200 Subject: [PATCH 36/43] Disable enqueue_native_command extension for out-of-order queues --- .../backends/cusparse/cusparse_task.hpp | 45 ++++++++++++++++--- .../cusparse/operations/cusparse_spmm.cpp | 6 +-- .../cusparse/operations/cusparse_spmv.cpp | 6 +-- .../cusparse/operations/cusparse_spsv.cpp | 6 +-- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp index 4c187c3db..3c51e8514 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_task.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp @@ -185,13 
+185,18 @@ void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, /// \p UseEnqueueNativeCommandExt controls whether host_task are used or the /// extension ext_codeplay_enqueue_native_command is used to launch tasks. The /// extension should only be used for asynchronous functions using native -/// backend's functions. +/// backend's functions. The extension can only be used for in-order queues as +/// the same cuStream needs to be used for the 3 steps to run an operation: +/// querying the buffer size, optimizing and running the computation. This means +/// a different cuStream can be used inside the native_command than the native +/// cuStream used by the extension. template sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl::queue queue, const std::vector &dependencies, Functor functor, matrix_handle_t sm_handle, sycl::buffer workspace_buffer, Ts... other_containers) { + bool is_in_order_queue = queue.is_in_order(); if (sm_handle->all_use_buffer()) { detail::data_type value_type = sm_handle->get_value_type(); detail::data_type int_type = sm_handle->get_int_type(); @@ -204,8 +209,14 @@ sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl:: auto workspace_acc = workspace_buffer.get_access(cgh); \ if constexpr (UseWorkspace) { \ if constexpr (UseEnqueueNativeCommandExt) { \ - submit_native_command_ext_with_acc(cgh, queue, functor, dependencies, \ - workspace_acc, fp_accs, int_accs); \ + if (is_in_order_queue) { \ + submit_native_command_ext_with_acc(cgh, queue, functor, dependencies, \ + workspace_acc, fp_accs, int_accs); \ + } \ + else { \ + submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, \ + int_accs); \ + } \ } \ else { \ submit_host_task_with_acc(cgh, queue, functor, workspace_acc, fp_accs, int_accs); \ @@ -214,7 +225,13 @@ sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl:: else { \ (void)workspace_buffer; \ if constexpr 
(UseEnqueueNativeCommandExt) { \ - submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, int_accs); \ + if (is_in_order_queue) { \ + submit_native_command_ext(cgh, queue, functor, dependencies, fp_accs, \ + int_accs); \ + } \ + else { \ + submit_host_task(cgh, queue, functor, fp_accs, int_accs); \ + } \ } \ else { \ submit_host_task(cgh, queue, functor, fp_accs, int_accs); \ @@ -254,7 +271,12 @@ sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl:: return queue.submit([&](sycl::handler &cgh) { cgh.depends_on(dependencies); if constexpr (UseEnqueueNativeCommandExt) { - submit_native_command_ext(cgh, queue, functor, dependencies); + if (is_in_order_queue) { + submit_native_command_ext(cgh, queue, functor, dependencies); + } + else { + submit_host_task(cgh, queue, functor); + } } else { submit_host_task(cgh, queue, functor); @@ -392,6 +414,19 @@ sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::q function_name, queue, {}, functor, sm_handle, no_workspace, other_containers...); } +// Helper function for functors submitted to host_task or native_command. +// When the extension is disabled, host_task are used and the synchronization is needed to ensure the sycl::event corresponds to the end of the whole functor. +// When the extension is enabled, host_task are still used for out-of-order queues, see description of dispatch_submit_impl_fp_int. 
+inline void synchronize_if_needed([[maybe_unused]] bool is_in_order_queue, CUstream cu_stream) { +#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND + CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); +#else + if (!is_in_order_queue) { /* out-of-order queues still run through host_task */ + CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); + } +#endif +} + } // namespace oneapi::mkl::sparse::cusparse #endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_ diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index 0af9fb173..b4d8c6b77 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -287,6 +287,7 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr CHECK_DESCR_MATCH(spmm_descr, C_handle, "spmm_optimize"); CHECK_DESCR_MATCH(spmm_descr, alg, "spmm_optimize"); + bool is_in_order_queue = queue.is_in_order(); auto compute_functor = [=](void* workspace_ptr) { auto cu_handle = spmm_descr->cu_handle; auto cu_a = A_handle->backend_handle; @@ -301,10 +302,7 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c, cu_type, cu_alg, workspace_ptr); check_status(status, __func__); -#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - auto cu_stream = spmm_descr->cu_stream; - CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); -#endif + synchronize_if_needed(is_in_order_queue, spmm_descr->cu_stream); }; if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) { // The accessor can only be created if the buffer size is greater than 0 diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index 0510ba20f..f00650f65 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++
b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -287,6 +287,7 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp CHECK_DESCR_MATCH(spmv_descr, y_handle, "spmv_optimize"); CHECK_DESCR_MATCH(spmv_descr, alg, "spmv_optimize"); + bool is_in_order_queue = queue.is_in_order(); auto compute_functor = [=](void *workspace_ptr) { auto cu_handle = spmv_descr->cu_handle; auto cu_a = A_handle->backend_handle; @@ -300,10 +301,7 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp auto status = cusparseSpMV(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, cu_alg, workspace_ptr); check_status(status, __func__); -#ifndef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - auto cu_stream = spmv_descr->cu_stream; - CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); -#endif + synchronize_if_needed(is_in_order_queue, spmv_descr->cu_stream); }; if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) { // The accessor can only be created if the buffer size is greater than 0 diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index fc3d46ad8..392318460 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -257,6 +257,7 @@ sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp CHECK_DESCR_MATCH(spsv_descr, y_handle, "spsv_optimize"); CHECK_DESCR_MATCH(spsv_descr, alg, "spsv_optimize"); + bool is_in_order_queue = queue.is_in_order(); auto functor = [=](sycl::interop_handle) { auto cu_handle = spsv_descr->cu_handle; auto cu_a = A_handle->backend_handle; @@ -272,10 +273,7 @@ sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp auto status = cusparseSpSV_solve(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, cu_descr); check_status(status, __func__); -#ifndef 
SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - auto cu_stream = spsv_descr->cu_stream; - CUDA_ERROR_FUNC(cuStreamSynchronize, cu_stream); -#endif + synchronize_if_needed(is_in_order_queue, spsv_descr->cu_stream); }; return dispatch_submit_native_ext(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle); From 43428ca71672d7e0cbcd181d5dfb45bf5f5229b8 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 25 Oct 2024 10:54:02 +0200 Subject: [PATCH 37/43] Test in-order queues --- .../sparse_blas/include/test_spmm.hpp | 126 ++++++++++-------- .../sparse_blas/include/test_spmv.hpp | 119 +++++++++-------- .../sparse_blas/include/test_spsv.hpp | 87 ++++++------ .../sparse_blas/source/sparse_spmm_buffer.cpp | 7 +- .../sparse_blas/source/sparse_spmm_usm.cpp | 7 +- .../sparse_blas/source/sparse_spmv_buffer.cpp | 5 +- .../sparse_blas/source/sparse_spmv_usm.cpp | 5 +- .../sparse_blas/source/sparse_spsv_buffer.cpp | 5 +- .../sparse_blas/source/sparse_spsv_usm.cpp | 5 +- 9 files changed, 202 insertions(+), 164 deletions(-) diff --git a/tests/unit_tests/sparse_blas/include/test_spmm.hpp b/tests/unit_tests/sparse_blas/include/test_spmm.hpp index d47b1732c..983c0e63c 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmm.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmm.hpp @@ -58,6 +58,7 @@ void test_helper_with_format_with_transpose( const std::vector &non_default_algorithms, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, int &num_passed, int &num_skipped) { + sycl::property_list queue_properties; double density_A_matrix = 0.8; fpType fp_zero = set_fp_value()(0.f, 0.f); fpType fp_one = set_fp_value()(1.f, 0.f); @@ -85,108 +86,119 @@ void test_helper_with_format_with_transpose( // Basic test EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, default_properties, 
no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, default_properties, true, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, true, no_scalars_on_device), num_passed, num_skipped); // Test alpha and beta on the device EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, default_properties, no_reset_data, true), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, - oneapi::mkl::index_base::one, col_major, transpose_A, transpose_B, - fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, oneapi::mkl::index_base::one, col_major, transpose_A, + transpose_B, fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), 
num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, set_fp_value()(2.f, 1.5f), - fp_zero, ldb, ldc, default_alg, default_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + set_fp_value()(2.f, 1.5f), fp_zero, ldb, ldc, default_alg, + default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test non-default beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, - set_fp_value()(3.2f, 1.f), ldb, ldc, default_alg, + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, set_fp_value()(3.2f, 1.f), ldb, ldc, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_zero, fp_one, ldb, ldc, - default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_zero, fp_one, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha and beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_zero, fp_zero, ldb, ldc, - 
default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_zero, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default ldb EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb + 5, ldc, - default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb + 5, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default ldc EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc + 6, - default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc + 6, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test row major layout EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - oneapi::mkl::layout::row_major, transpose_A, transpose_B, fp_one, - fp_zero, ncols_B, ncols_C, default_alg, default_A_view, - default_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, 
index_zero, oneapi::mkl::layout::row_major, + transpose_A, transpose_B, fp_one, fp_zero, ncols_B, ncols_C, + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test int64 indices long long_nrows_A = 27, long_ncols_A = 13, long_ncols_C = 6; auto [long_ldc, long_ldb] = swap_if_transposed(transpose_A, long_nrows_A, long_ncols_A); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i64(dev, format, long_nrows_A, long_ncols_A, long_ncols_C, - density_A_matrix, index_zero, col_major, transpose_A, transpose_B, - fp_one, fp_zero, long_ldb, long_ldc, default_alg, default_A_view, - default_properties, no_reset_data, no_scalars_on_device), + test_functor_i64(dev, queue_properties, format, long_nrows_A, long_ncols_A, + long_ncols_C, density_A_matrix, index_zero, col_major, transpose_A, + transpose_B, fp_one, fp_zero, long_ldb, long_ldc, default_alg, + default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test other algorithms for (auto alg : non_default_algorithms) { EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, - index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, - ldb, ldc, alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); } // Test matrix properties for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, - index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, - ldb, ldc, default_alg, default_A_view, properties, no_reset_data, - 
no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); } + // In-order queue + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, { property::queue::in_order::in_order() }, format, nrows_A, + ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, + transpose_B, fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), + num_passed, num_skipped); } { // Test different sizes @@ -199,10 +211,10 @@ void test_helper_with_format_with_transpose( int ldb = nrows_B; int ldc = nrows_C; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, ncols_C, density_A_matrix, index_zero, - col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, - default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, ncols_C, + density_A_matrix, index_zero, col_major, transpose_A, transpose_B, + fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } } diff --git a/tests/unit_tests/sparse_blas/include/test_spmv.hpp b/tests/unit_tests/sparse_blas/include/test_spmv.hpp index 66af38a7c..e22b5a9e7 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmv.hpp @@ -56,6 +56,7 @@ void test_helper_with_format_with_transpose( sparse_matrix_format_t format, const std::vector &non_default_algorithms, oneapi::mkl::transpose transpose_val, int &num_passed, int &num_skipped) { + sycl::property_list queue_properties; double density_A_matrix = 0.8; fpType fp_zero = set_fp_value()(0.f, 0.f); fpType fp_one = 
set_fp_value()(1.f, 0.f); @@ -72,133 +73,143 @@ void test_helper_with_format_with_transpose( // Basic test EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Reset data EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, default_properties, true, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, true, no_scalars_on_device), num_passed, num_skipped); // Test alpha and beta on the device EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, default_A_view, default_properties, - no_reset_data, true), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, oneapi::mkl::index_base::one, transpose_val, fp_one, fp_zero, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, 
format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - set_fp_value()(2.f, 1.5f), fp_zero, default_alg, default_A_view, - default_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, set_fp_value()(2.f, 1.5f), fp_zero, + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test non-default beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, set_fp_value()(3.2f, 1.f), default_alg, default_A_view, - default_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, set_fp_value()(3.2f, 1.f), + default_alg, default_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_zero, fp_one, default_alg, default_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_zero, fp_one, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test 0 alpha and beta EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_zero, fp_zero, default_alg, default_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_zero, fp_zero, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, 
num_skipped); // Test int64 indices EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i64(dev, format, 27L, 13L, density_A_matrix, index_zero, transpose_val, fp_one, - fp_zero, default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i64(dev, queue_properties, format, 27L, 13L, density_A_matrix, index_zero, + transpose_val, fp_one, fp_zero, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower triangular oneapi::mkl::sparse::matrix_view triangular_A_view( oneapi::mkl::sparse::matrix_descr::triangular); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, triangular_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper triangular triangular_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, triangular_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower triangular unit diagonal oneapi::mkl::sparse::matrix_view triangular_unit_A_view( oneapi::mkl::sparse::matrix_descr::triangular); triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, 
fp_zero, default_alg, triangular_unit_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, + triangular_unit_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Upper triangular unit diagonal triangular_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, triangular_unit_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, + triangular_unit_A_view, default_properties, no_reset_data, + no_scalars_on_device), num_passed, num_skipped); // Lower symmetric oneapi::mkl::sparse::matrix_view symmetric_view(oneapi::mkl::sparse::matrix_descr::symmetric); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, symmetric_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, symmetric_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper symmetric symmetric_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, symmetric_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, 
symmetric_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Lower hermitian oneapi::mkl::sparse::matrix_view hermitian_view(oneapi::mkl::sparse::matrix_descr::hermitian); EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, hermitian_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, hermitian_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Upper hermitian hermitian_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, transpose_val, - fp_one, fp_zero, default_alg, hermitian_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, hermitian_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test other algorithms for (auto alg : non_default_algorithms) { EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, - transpose_val, fp_one, fp_zero, alg, default_A_view, + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } // Test matrix properties for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, nrows_A, ncols_A, density_A_matrix, index_zero, - transpose_val, fp_one, fp_zero, default_alg, 
default_A_view, - properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, nrows_A, ncols_A, density_A_matrix, + index_zero, transpose_val, fp_one, fp_zero, default_alg, + default_A_view, properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } + // In-order queue + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, { property::queue::in_order::in_order() }, format, nrows_A, ncols_A, + density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, default_alg, + default_A_view, default_properties, no_reset_data, no_scalars_on_device), + num_passed, num_skipped); } /** diff --git a/tests/unit_tests/sparse_blas/include/test_spsv.hpp b/tests/unit_tests/sparse_blas/include/test_spsv.hpp index ca58dfd7a..aabc0f569 100644 --- a/tests/unit_tests/sparse_blas/include/test_spsv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spsv.hpp @@ -51,6 +51,7 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes sycl::device *dev, sparse_matrix_format_t format, oneapi::mkl::transpose transpose_val, int &num_passed, int &num_skipped) { + sycl::property_list queue_properties; double density_A_matrix = 0.144; fpType alpha = set_fp_value()(1.f, 0.f); int m = 277; @@ -69,87 +70,95 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes // Basic test EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Reset data - EXPECT_TRUE_OR_FUTURE_SKIP(test_functor_i32(dev, format, m, density_A_matrix, index_zero, - transpose_val, alpha, default_alg, default_A_view, - default_properties, true, 
no_scalars_on_device), - num_passed, num_skipped); + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + true, no_scalars_on_device), + num_passed, num_skipped); // Test alpha on the device EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, default_properties, no_reset_data, true), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + no_reset_data, true), num_passed, num_skipped); // Test index_base 1 EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, oneapi::mkl::index_base::one, - transpose_val, alpha, default_alg, default_A_view, default_properties, - no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, + oneapi::mkl::index_base::one, transpose_val, alpha, default_alg, + default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper triangular matrix EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, upper_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, upper_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test lower triangular unit diagonal matrix oneapi::mkl::sparse::matrix_view triangular_unit_A_view( oneapi::mkl::sparse::matrix_descr::triangular); triangular_unit_A_view.diag_view = oneapi::mkl::diag::unit; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, 
triangular_unit_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, triangular_unit_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper triangular unit diagonal matrix triangular_unit_A_view.uplo_view = oneapi::mkl::uplo::upper; EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, triangular_unit_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, triangular_unit_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test non-default alpha EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, - set_fp_value()(2.f, 1.5f), default_alg, default_A_view, - default_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, set_fp_value()(2.f, 1.5f), default_alg, + default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test int64 indices EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i64(dev, format, 15L, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i64(dev, queue_properties, format, 15L, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test lower no_optimize_alg EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - no_optimize_alg, default_A_view, 
default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, no_optimize_alg, default_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test upper no_optimize_alg EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - no_optimize_alg, upper_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, no_optimize_alg, upper_A_view, default_properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test matrix properties for (auto properties : get_all_matrix_properties_combinations(properties_queue, format)) { // Basic test with matrix properties EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - default_alg, default_A_view, properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, default_alg, default_A_view, properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); // Test lower no_optimize_alg with matrix properties EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, format, m, density_A_matrix, index_zero, transpose_val, alpha, - no_optimize_alg, default_A_view, properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, queue_properties, format, m, density_A_matrix, index_zero, + transpose_val, alpha, no_optimize_alg, default_A_view, properties, + no_reset_data, no_scalars_on_device), num_passed, num_skipped); } + // In-order queue + EXPECT_TRUE_OR_FUTURE_SKIP( + test_functor_i32(dev, { property::queue::in_order::in_order() }, format, m, + density_A_matrix, index_zero, transpose_val, alpha, default_alg, + 
default_A_view, default_properties, no_reset_data, no_scalars_on_device), + num_passed, num_skipped); } /** diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp index 4a37e8c7c..f76048386 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp @@ -28,8 +28,9 @@ extern std::vector devices; namespace { template -int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, - intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index, +int test_spmm(sycl::device *dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C, + double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb, intType ldc, oneapi::mkl::sparse::spmm_alg alg, @@ -40,7 +41,7 @@ int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, // Scalars on the device is not planned to be supported with the buffer API return 1; } - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp index f74403d6c..9618ef870 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp @@ -28,15 +28,16 @@ extern std::vector devices; namespace { template -int test_spmm(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, - intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index, +int 
test_spmm(sycl::device *dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C, + double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb, intType ldc, oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set &matrix_properties, bool reset_data, bool test_scalar_on_device) { - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp index f56deaf91..20a4b6f16 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp @@ -28,7 +28,8 @@ extern std::vector devices; namespace { template -int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, +int test_spmv(sycl::device *dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view, @@ -38,7 +39,7 @@ int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, // Scalars on the device is not planned to be supported with the buffer API return 1; } - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp 
b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp index 7f083003b..bf5fac9da 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp @@ -28,13 +28,14 @@ extern std::vector devices; namespace { template -int test_spmv(sycl::device *dev, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, +int test_spmv(sycl::device *dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType nrows_A, intType ncols_A, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set &matrix_properties, bool reset_data, bool test_scalar_on_device) { - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); if (require_square_matrix(A_view, matrix_properties)) { ncols_A = nrows_A; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp index ebf47fd5e..163285e07 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp @@ -28,7 +28,8 @@ extern std::vector devices; namespace { template -int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, double density_A_matrix, +int test_spsv(sycl::device *dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType m, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set &matrix_properties, @@ -37,7 +38,7 @@ int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, doubl // Scalars on the device is not planned to be supported with the buffer API return 1; } - 
sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); intType indexing = (index == oneapi::mkl::index_base::zero) ? 0 : 1; const std::size_t mu = static_cast(m); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index 755bdd155..a5d24829f 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -28,12 +28,13 @@ extern std::vector devices; namespace { template -int test_spsv(sycl::device *dev, sparse_matrix_format_t format, intType m, double density_A_matrix, +int test_spsv(sycl::device *dev, sycl::property_list queue_properties, + sparse_matrix_format_t format, intType m, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view, const std::set &matrix_properties, bool reset_data, bool test_scalar_on_device) { - sycl::queue main_queue(*dev, exception_handler_t()); + sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); intType indexing = (index == oneapi::mkl::index_base::zero) ? 
0 : 1; const std::size_t mu = static_cast(m); From 790d0182334151e848e522efb8558aacaa23617b Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 25 Oct 2024 11:18:33 +0200 Subject: [PATCH 38/43] clang-format-19.1 --- .pre-commit-config.yaml | 8 + _clang-format | 96 +- .../level3/gemm_usm_mklcpu_cublas.cpp | 28 +- examples/include/example_helper.hpp | 6 +- .../sparse_blas_spmv_usm_mklcpu_cusparse.cpp | 30 +- .../sparse_blas_spmv_usm.cpp | 32 +- include/oneapi/mkl/bfloat16.hpp | 50 +- include/oneapi/mkl/blas.hxx | 4028 +++++---- .../mkl/blas/detail/blas_ct_backends.hxx | 3471 ++++---- .../oneapi/mkl/blas/detail/blas_loader.hxx | 4457 +++++----- .../oneapi/mkl/blas/detail/cublas/blas_ct.hxx | 2920 ++++--- .../blas/detail/cublas/onemkl_blas_cublas.hxx | 3959 +++++---- .../oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx | 2920 ++++--- .../oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx | 2920 ++++--- .../oneapi/mkl/blas/detail/netlib/blas_ct.hxx | 2925 ++++--- .../mkl/blas/detail/onemkl_blas_backends.hxx | 4543 +++++------ .../mkl/blas/detail/portblas/blas_ct.hxx | 2670 +++--- .../mkl/blas/detail/rocblas/blas_ct.hxx | 2492 +++--- .../detail/rocblas/onemkl_blas_rocblas.hxx | 3426 ++++---- include/oneapi/mkl/detail/exceptions.hpp | 6 +- include/oneapi/mkl/detail/get_device_id.hpp | 2 +- include/oneapi/mkl/dft/backward.hpp | 54 +- include/oneapi/mkl/dft/detail/commit_impl.hpp | 106 +- include/oneapi/mkl/dft/detail/dft_ct.hxx | 106 +- include/oneapi/mkl/dft/detail/types_impl.hpp | 2 +- include/oneapi/mkl/dft/forward.hpp | 52 +- include/oneapi/mkl/exceptions.hpp | 38 +- .../mkl/lapack/detail/cusolver/lapack_ct.hxx | 1782 ++-- .../cusolver/onemkl_lapack_cusolver.hxx | 2342 +++--- .../mkl/lapack/detail/lapack_loader.hpp | 2814 +++---- .../oneapi/mkl/lapack/detail/lapack_rt.hpp | 2252 ++--- .../lapack/detail/mkl_common/lapack_ct.hxx | 1758 ++-- .../mkl_common/onemkl_lapack_backends.hxx | 2728 +++---- .../mkl/lapack/detail/rocsolver/lapack_ct.hxx | 2798 +++---- 
.../rocsolver/onemkl_lapack_rocsolver.hxx | 2351 +++--- include/oneapi/mkl/lapack/exceptions.hpp | 16 +- .../mkl/sparse_blas/detail/helper_types.hpp | 2 +- .../detail/onemkl_sparse_blas_backends.hxx | 168 +- .../mkl/sparse_blas/detail/sparse_blas_ct.hxx | 122 +- .../mkl/sparse_blas/detail/sparse_blas_rt.hpp | 146 +- src/blas/backends/cublas/cublas_batch.cpp | 1614 ++-- .../backends/cublas/cublas_extensions.cpp | 592 +- src/blas/backends/cublas/cublas_handle.hpp | 4 +- src/blas/backends/cublas/cublas_helper.hpp | 4 +- src/blas/backends/cublas/cublas_level1.cpp | 874 +- src/blas/backends/cublas/cublas_level2.cpp | 1596 ++-- src/blas/backends/cublas/cublas_level3.cpp | 820 +- .../backends/cublas/cublas_scope_handle.cpp | 12 +- .../backends/cublas/cublas_scope_handle.hpp | 14 +- .../cublas/cublas_scope_handle_hipsycl.cpp | 6 +- .../cublas/cublas_scope_handle_hipsycl.hpp | 8 +- src/blas/backends/cublas/cublas_task.hpp | 6 +- src/blas/backends/mkl_common/mkl_batch.cxx | 904 +-- .../backends/mkl_common/mkl_blas_backend.hxx | 3874 ++++----- .../backends/mkl_common/mkl_extensions.cxx | 340 +- src/blas/backends/mkl_common/mkl_level1.cxx | 538 +- src/blas/backends/mkl_common/mkl_level2.cxx | 830 +- src/blas/backends/mkl_common/mkl_level3.cxx | 488 +- src/blas/backends/netlib/netlib_batch.cxx | 970 ++- src/blas/backends/netlib/netlib_common.hpp | 6 +- .../backends/netlib/netlib_extensions.cxx | 346 +- src/blas/backends/netlib/netlib_level1.cpp | 20 +- src/blas/backends/netlib/netlib_level1.cxx | 746 +- src/blas/backends/netlib/netlib_level2.cxx | 1186 +-- src/blas/backends/netlib/netlib_level3.cxx | 700 +- src/blas/backends/portblas/portblas_batch.cxx | 978 +-- .../backends/portblas/portblas_gemm_bias.cxx | 72 +- .../backends/portblas/portblas_level1.cxx | 298 +- .../backends/portblas/portblas_level2.cxx | 456 +- .../backends/portblas/portblas_level3.cxx | 386 +- .../portblas/portblas_level3_bfloat16.cpp | 32 +- .../portblas/portblas_level3_half.cpp | 56 +- 
src/blas/backends/rocblas/rocblas_batch.cpp | 1296 +-- .../backends/rocblas/rocblas_extensions.cpp | 544 +- src/blas/backends/rocblas/rocblas_handle.hpp | 4 +- src/blas/backends/rocblas/rocblas_helper.hpp | 4 +- src/blas/backends/rocblas/rocblas_level1.cpp | 830 +- src/blas/backends/rocblas/rocblas_level2.cpp | 1780 ++-- src/blas/backends/rocblas/rocblas_level3.cpp | 800 +- .../backends/rocblas/rocblas_scope_handle.cpp | 14 +- .../backends/rocblas/rocblas_scope_handle.hpp | 14 +- .../rocblas/rocblas_scope_handle_hipsycl.cpp | 8 +- .../rocblas/rocblas_scope_handle_hipsycl.hpp | 10 +- src/blas/backends/rocblas/rocblas_task.hpp | 6 +- src/blas/blas_loader.cpp | 6936 ++++++++-------- src/blas/function_table.hpp | 7208 ++++++++--------- .../backend_backward_instantiations.cxx | 44 +- .../backend_forward_instantiations.cxx | 44 +- src/dft/backends/cufft/backward.cpp | 74 +- src/dft/backends/cufft/execute_helper.hpp | 36 +- src/dft/backends/cufft/forward.cpp | 72 +- src/dft/backends/descriptor.cpp | 10 +- src/dft/backends/mklcpu/backward.cpp | 102 +- src/dft/backends/mklcpu/forward.cpp | 100 +- src/dft/backends/mklgpu/backward.cpp | 62 +- src/dft/backends/mklgpu/forward.cpp | 59 +- src/dft/backends/portfft/portfft_helper.hpp | 8 +- src/dft/backends/rocfft/backward.cpp | 140 +- src/dft/backends/rocfft/execute_helper.hpp | 16 +- src/dft/backends/rocfft/forward.cpp | 138 +- src/include/allocator_helper.hpp | 4 +- src/include/function_table_initializer.hpp | 10 +- .../backends/cusolver/cusolver_batch.cpp | 962 +-- .../backends/cusolver/cusolver_handle.hpp | 4 +- .../backends/cusolver/cusolver_helper.hpp | 22 +- .../backends/cusolver/cusolver_lapack.cpp | 1734 ++-- .../cusolver/cusolver_scope_handle.cpp | 12 +- .../cusolver/cusolver_scope_handle.hpp | 14 +- .../backends/cusolver/cusolver_task.hpp | 4 +- src/lapack/backends/mkl_common/mkl_lapack.cxx | 2284 +++--- .../mkl_common/mkl_lapack_backend.hpp | 2008 ++--- .../backends/rocsolver/rocsolver_batch.cpp | 902 +-- 
.../backends/rocsolver/rocsolver_handle.hpp | 4 +- .../backends/rocsolver/rocsolver_helper.hpp | 12 +- .../backends/rocsolver/rocsolver_lapack.cpp | 1502 ++-- .../rocsolver/rocsolver_scope_handle.cpp | 12 +- .../rocsolver/rocsolver_scope_handle.hpp | 12 +- .../backends/rocsolver/rocsolver_task.hpp | 4 +- src/lapack/function_table.hpp | 2660 +++--- src/lapack/lapack_loader.cpp | 2430 +++--- src/rng/backends/curand/curand_task.hpp | 16 +- src/rng/backends/mklcpu/cpu_common.hpp | 8 +- src/rng/backends/rocrand/rocrand_task.hpp | 16 +- .../cusparse/cusparse_global_handle.hpp | 4 +- .../backends/cusparse/cusparse_handles.cpp | 106 +- .../cusparse/cusparse_scope_handle.cpp | 14 +- .../cusparse/cusparse_scope_handle.hpp | 18 +- .../backends/cusparse/cusparse_task.hpp | 60 +- .../cusparse/operations/cusparse_spmv.cpp | 40 +- .../cusparse/operations/cusparse_spsv.cpp | 30 +- .../backends/mkl_common/mkl_handles.cxx | 92 +- .../backends/mkl_common/mkl_spmm.cxx | 58 +- .../backends/mkl_common/mkl_spmv.cxx | 44 +- .../backends/mkl_common/mkl_spsv.cxx | 32 +- src/sparse_blas/common_op_verification.hpp | 10 +- src/sparse_blas/function_table.hpp | 128 +- src/sparse_blas/macros.hpp | 20 +- src/sparse_blas/sparse_blas_loader.cpp | 150 +- src/sparse_blas/sycl_helper.hpp | 16 +- .../blas/batch/axpy_batch_stride.cpp | 20 +- .../blas/batch/axpy_batch_stride_usm.cpp | 20 +- .../unit_tests/blas/batch/axpy_batch_usm.cpp | 61 +- .../blas/batch/copy_batch_stride.cpp | 20 +- .../blas/batch/copy_batch_stride_usm.cpp | 20 +- .../unit_tests/blas/batch/copy_batch_usm.cpp | 59 +- .../blas/batch/dgmm_batch_stride.cpp | 23 +- .../blas/batch/dgmm_batch_stride_usm.cpp | 23 +- .../unit_tests/blas/batch/dgmm_batch_usm.cpp | 69 +- .../blas/batch/gemm_batch_stride.cpp | 27 +- .../blas/batch/gemm_batch_stride_usm.cpp | 35 +- .../unit_tests/blas/batch/gemm_batch_usm.cpp | 88 +- .../blas/batch/gemv_batch_stride.cpp | 27 +- .../blas/batch/gemv_batch_stride_usm.cpp | 24 +- 
.../unit_tests/blas/batch/gemv_batch_usm.cpp | 73 +- .../blas/batch/imatcopy_batch_stride.cpp | 16 +- .../blas/batch/imatcopy_batch_stride_usm.cpp | 20 +- .../blas/batch/imatcopy_batch_usm.cpp | 24 +- .../blas/batch/omatadd_batch_stride.cpp | 16 +- .../blas/batch/omatadd_batch_stride_usm.cpp | 24 +- .../blas/batch/omatcopy_batch_stride.cpp | 16 +- .../blas/batch/omatcopy_batch_stride_usm.cpp | 22 +- .../blas/batch/omatcopy_batch_usm.cpp | 38 +- .../blas/batch/syrk_batch_stride.cpp | 30 +- .../blas/batch/syrk_batch_stride_usm.cpp | 36 +- .../unit_tests/blas/batch/syrk_batch_usm.cpp | 65 +- .../blas/batch/trsm_batch_stride.cpp | 23 +- .../blas/batch/trsm_batch_stride_usm.cpp | 23 +- .../unit_tests/blas/batch/trsm_batch_usm.cpp | 68 +- tests/unit_tests/blas/extensions/imatcopy.cpp | 16 +- .../blas/extensions/imatcopy_usm.cpp | 16 +- tests/unit_tests/blas/extensions/omatadd.cpp | 16 +- .../blas/extensions/omatadd_usm.cpp | 16 +- tests/unit_tests/blas/extensions/omatcopy.cpp | 16 +- .../unit_tests/blas/extensions/omatcopy2.cpp | 16 +- .../blas/extensions/omatcopy2_usm.cpp | 16 +- .../blas/extensions/omatcopy_usm.cpp | 16 +- .../blas/include/reference_blas_templates.hpp | 1304 ++- .../blas/include/reference_blas_wrappers.hpp | 1470 ++-- tests/unit_tests/blas/include/test_common.hpp | 72 +- tests/unit_tests/blas/level1/axpby.cpp | 22 +- tests/unit_tests/blas/level1/axpby_usm.cpp | 20 +- tests/unit_tests/blas/level1/axpy.cpp | 19 +- tests/unit_tests/blas/level1/axpy_usm.cpp | 19 +- tests/unit_tests/blas/level1/dotc.cpp | 18 +- tests/unit_tests/blas/level1/dotc_usm.cpp | 20 +- tests/unit_tests/blas/level1/dotu.cpp | 18 +- tests/unit_tests/blas/level1/dotu_usm.cpp | 20 +- tests/unit_tests/blas/level1/rot.cpp | 21 +- tests/unit_tests/blas/level1/rot_usm.cpp | 20 +- tests/unit_tests/blas/level1/rotg.cpp | 18 +- tests/unit_tests/blas/level1/rotg_usm.cpp | 36 +- tests/unit_tests/blas/level1/rotm.cpp | 20 +- tests/unit_tests/blas/level1/rotm_usm.cpp | 20 +- 
tests/unit_tests/blas/level1/rotmg_usm.cpp | 30 +- tests/unit_tests/blas/level1/sdsdot.cpp | 20 +- tests/unit_tests/blas/level1/sdsdot_usm.cpp | 22 +- tests/unit_tests/blas/level2/gbmv.cpp | 20 +- tests/unit_tests/blas/level2/gbmv_usm.cpp | 20 +- tests/unit_tests/blas/level2/gemv.cpp | 20 +- tests/unit_tests/blas/level2/gemv_usm.cpp | 20 +- tests/unit_tests/blas/level2/ger.cpp | 21 +- tests/unit_tests/blas/level2/ger_usm.cpp | 20 +- tests/unit_tests/blas/level2/gerc.cpp | 20 +- tests/unit_tests/blas/level2/gerc_usm.cpp | 20 +- tests/unit_tests/blas/level2/geru.cpp | 20 +- tests/unit_tests/blas/level2/geru_usm.cpp | 20 +- tests/unit_tests/blas/level2/hbmv.cpp | 20 +- tests/unit_tests/blas/level2/hbmv_usm.cpp | 20 +- tests/unit_tests/blas/level2/hemv.cpp | 20 +- tests/unit_tests/blas/level2/hemv_usm.cpp | 20 +- tests/unit_tests/blas/level2/her.cpp | 19 +- tests/unit_tests/blas/level2/her2.cpp | 20 +- tests/unit_tests/blas/level2/her2_usm.cpp | 20 +- tests/unit_tests/blas/level2/her_usm.cpp | 18 +- tests/unit_tests/blas/level2/hpmv.cpp | 20 +- tests/unit_tests/blas/level2/hpmv_usm.cpp | 20 +- tests/unit_tests/blas/level2/hpr.cpp | 19 +- tests/unit_tests/blas/level2/hpr2.cpp | 20 +- tests/unit_tests/blas/level2/hpr2_usm.cpp | 20 +- tests/unit_tests/blas/level2/hpr_usm.cpp | 18 +- tests/unit_tests/blas/level2/sbmv.cpp | 20 +- tests/unit_tests/blas/level2/sbmv_usm.cpp | 20 +- tests/unit_tests/blas/level2/spmv.cpp | 20 +- tests/unit_tests/blas/level2/spmv_usm.cpp | 20 +- tests/unit_tests/blas/level2/spr.cpp | 19 +- tests/unit_tests/blas/level2/spr2.cpp | 20 +- tests/unit_tests/blas/level2/spr2_usm.cpp | 20 +- tests/unit_tests/blas/level2/spr_usm.cpp | 18 +- tests/unit_tests/blas/level2/symv.cpp | 20 +- tests/unit_tests/blas/level2/symv_usm.cpp | 20 +- tests/unit_tests/blas/level2/syr.cpp | 19 +- tests/unit_tests/blas/level2/syr2.cpp | 20 +- tests/unit_tests/blas/level2/syr2_usm.cpp | 20 +- tests/unit_tests/blas/level2/syr_usm.cpp | 18 +- 
.../dft/include/compute_inplace.hpp | 18 +- .../dft/include/compute_out_of_place.hpp | 19 +- .../unit_tests/dft/include/reference_dft.hpp | 10 +- tests/unit_tests/dft/include/test_common.hpp | 33 +- tests/unit_tests/dft/source/compute_tests.cpp | 18 +- tests/unit_tests/include/test_helper.hpp | 18 +- .../include/lapack_reference_wrappers.hpp | 711 +- tests/unit_tests/main_test.cpp | 2 +- .../unit_tests/rng/device/include/moments.hpp | 5 +- .../device/include/rng_device_test_common.hpp | 7 +- .../rng/include/rng_test_common.hpp | 7 +- .../include/common_sparse_reference.hpp | 14 +- .../sparse_blas/include/test_common.hpp | 58 +- .../sparse_blas/include/test_spmm.hpp | 26 +- .../sparse_blas/include/test_spmv.hpp | 22 +- .../sparse_blas/include/test_spsv.hpp | 18 +- .../sparse_blas/source/sparse_spmm_buffer.cpp | 14 +- .../sparse_blas/source/sparse_spmm_usm.cpp | 28 +- .../sparse_blas/source/sparse_spmv_buffer.cpp | 14 +- .../sparse_blas/source/sparse_spmv_usm.cpp | 28 +- .../sparse_blas/source/sparse_spsv_buffer.cpp | 14 +- .../sparse_blas/source/sparse_spsv_usm.cpp | 26 +- 256 files changed, 60219 insertions(+), 61206 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..1be15eecd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: + +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v19.1.0 + hooks: + - id: clang-format + files: \.(c|cxx|cpp|h|hxx|hpp)$ + exclude: ^deps/ diff --git a/_clang-format b/_clang-format index 37a50f367..aef7d8e0f 100644 --- a/_clang-format +++ b/_clang-format @@ -21,31 +21,31 @@ Language: Cpp AccessModifierOffset: -4 AlignAfterOpenBracket: Align -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignConsecutiveMacros: true +AlignConsecutiveAssignments: + Enabled: false +AlignConsecutiveDeclarations: + Enabled: false +AlignConsecutiveMacros: + Enabled: true AlignEscapedNewlines: Left 
-AlignOperands: true -AlignTrailingComments: false +AlignOperands: Align +AlignTrailingComments: + Kind: Never AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: false -AllowShortBlocksOnASingleLine: false +AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: Inline AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: Yes BinPackArguments: true BinPackParameters: true BraceWrapping: AfterCaseLabel: false AfterClass: false - AfterControlStatement: false + AfterControlStatement: Never AfterEnum: false AfterFunction: false AfterNamespace: false @@ -59,46 +59,33 @@ BraceWrapping: SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false +BreakAfterReturnType: Automatic BreakBeforeBinaryOperators: None BreakBeforeBraces: Custom BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon BreakInheritanceList: BeforeColon BreakStringLiterals: false +BreakTemplateDeclarations: Yes ColumnLimit: 100 -CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false -ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 8 ContinuationIndentWidth: 4 Cpp11BracedListStyle: false -DerivePointerAlignment: true +DerivePointerAlignment: false DisableFormat: false FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^' - Priority: 2 - - Regex: '^<.*\.h>' - Priority: 1 - - Regex: '^<.*' - Priority: 2 - - Regex: '.*' - Priority: 3 -IncludeIsMainRegex: '([-_](test|unittest))?$' IndentCaseLabels: true IndentPPDirectives: None IndentWidth: 4 IndentWrappedFunctionNames: false 
-KeepEmptyLinesAtTheStartOfBlocks: false -MacroBlockBegin: '' -MacroBlockEnd: '' +KeepEmptyLines: + AtEndOfFile: false + AtStartOfBlock: false + AtStartOfFile: false MaxEmptyLinesToKeep: 1 NamespaceIndentation: None +PackConstructorInitializers: CurrentLine PenaltyBreakAssignment: 2 PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 @@ -108,57 +95,24 @@ PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 PointerAlignment: Left -RawStringFormats: - - Language: Cpp - Delimiters: - - cc - - CC - - cpp - - Cpp - - CPP - - 'c++' - - 'C++' - CanonicalDelimiter: '' - BasedOnStyle: google - - Language: TextProto - Delimiters: - - pb - - PB - - proto - - PROTO - EnclosingFunctions: - - EqualsProto - - EquivToProto - - PARSE_PARTIAL_TEXT_PROTO - - PARSE_TEST_PROTO - - PARSE_TEXT_PROTO - - ParseTextOrDie - - ParseTextProtoOrDie - CanonicalDelimiter: '' - BasedOnStyle: google ReflowComments: false -SortIncludes: false -SortUsingDeclarations: false +SortIncludes: Never +SortUsingDeclarations: Never SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: ControlStatements SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 -SpacesInAngles: false +SpacesInAngles: Never SpacesInContainerLiterals: false -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false +SpacesInParens: Never SpacesInSquareBrackets: false -Standard: Cpp11 -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 1 +Standard: c++17 UseTab: Never ... 
diff --git a/examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp b/examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp index 358c0b768..c6beb44da 100644 --- a/examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp +++ b/examples/blas/compile_time_dispatching/level3/gemm_usm_mklcpu_cublas.cpp @@ -61,7 +61,7 @@ // // is performed and finally the results are post processed. // -void run_gemm_example(const sycl::device &cpu_dev, const sycl::device &gpu_dev) { +void run_gemm_example(const sycl::device& cpu_dev, const sycl::device& gpu_dev) { // // Initialize data for Gemm // @@ -89,11 +89,11 @@ void run_gemm_example(const sycl::device &cpu_dev, const sycl::device &gpu_dev) // Catch asynchronous exceptions for CPU and GPU auto cpu_exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cerr << "Caught asynchronous SYCL exception on CPU device during GEMM:" << std::endl; std::cerr << "\t" << e.what() << std::endl; @@ -102,11 +102,11 @@ void run_gemm_example(const sycl::device &cpu_dev, const sycl::device &gpu_dev) std::exit(2); }; auto gpu_exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cerr << "Caught asynchronous SYCL exception on GPU device during GEMM:" << std::endl; std::cerr << "\t" << e.what() << std::endl; @@ -141,9 +141,9 @@ void run_gemm_example(const sycl::device &cpu_dev, const sycl::device &gpu_dev) sycl::context cpu_cxt = cpu_queue.get_context(); // allocate on CPU device and copy data from host to SYCL CPU device - float *cpu_A = sycl::malloc_device(sizea * 
sizeof(float), cpu_queue); - float *cpu_B = sycl::malloc_device(sizeb * sizeof(float), cpu_queue); - float *cpu_C = sycl::malloc_device(sizec * sizeof(float), cpu_queue); + float* cpu_A = sycl::malloc_device(sizea * sizeof(float), cpu_queue); + float* cpu_B = sycl::malloc_device(sizeb * sizeof(float), cpu_queue); + float* cpu_C = sycl::malloc_device(sizec * sizeof(float), cpu_queue); if (!cpu_A || !cpu_B || !cpu_C) { throw std::runtime_error("Failed to allocate USM memory."); } @@ -159,9 +159,9 @@ void run_gemm_example(const sycl::device &cpu_dev, const sycl::device &gpu_dev) sycl::context gpu_cxt = gpu_queue.get_context(); // allocate on GPU device and copy data from host to SYCL GPU device - float *gpu_A = sycl::malloc_device(sizea * sizeof(float), gpu_queue); - float *gpu_B = sycl::malloc_device(sizeb * sizeof(float), gpu_queue); - float *gpu_C = sycl::malloc_device(sizec * sizeof(float), gpu_queue); + float* gpu_A = sycl::malloc_device(sizea * sizeof(float), gpu_queue); + float* gpu_B = sycl::malloc_device(sizeb * sizeof(float), gpu_queue); + float* gpu_C = sycl::malloc_device(sizec * sizeof(float), gpu_queue); if (!gpu_A || !gpu_B || !gpu_C) { throw std::runtime_error("Failed to allocate USM memory."); } @@ -260,7 +260,7 @@ void print_example_banner() { // // Main entry point for example. 
// -int main(int argc, char **argv) { +int main(int argc, char** argv) { print_example_banner(); try { @@ -279,13 +279,13 @@ int main(int argc, char **argv) { run_gemm_example(cpu_dev, gpu_dev); std::cout << "BLAS GEMM USM example ran OK on MKLCPU and CUBLAS" << std::endl; } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cerr << "Caught synchronous SYCL exception during GEMM:" << std::endl; std::cerr << "\t" << e.what() << std::endl; std::cerr << "\tSYCL error code: " << e.code().value() << std::endl; return 1; } - catch (std::exception const &e) { + catch (std::exception const& e) { std::cerr << "Caught std::exception during GEMM:"; std::cerr << "\t" << e.what() << std::endl; return 1; diff --git a/examples/include/example_helper.hpp b/examples/include/example_helper.hpp index 4a89e6fae..c5da54acf 100644 --- a/examples/include/example_helper.hpp +++ b/examples/include/example_helper.hpp @@ -88,7 +88,7 @@ fp rand_scalar() { } template -void rand_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld) { +void rand_matrix(vec& M, oneapi::mkl::transpose trans, int m, int n, int ld) { using fp = typename vec::value_type; if (trans == oneapi::mkl::transpose::nontrans) { @@ -104,7 +104,7 @@ void rand_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld) { } template -intType generate_sparse_matrix(const intType nx, intType *ia, intType *ja, fp *a, +intType generate_sparse_matrix(const intType nx, intType* ia, intType* ja, fp* a, const intType index = 0) { intType nz = nx, ny = nx; intType nnz = 0; @@ -172,7 +172,7 @@ bool check_result(fp res, fp ref, intType nFlops, intType index) { } template -void free_vec(std::vector &ptr_vec, sycl::queue queue) { +void free_vec(std::vector& ptr_vec, sycl::queue queue) { for (auto ptr : ptr_vec) { sycl::free(ptr, queue); } diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp 
b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp index 316d2c744..2f3be76ed 100644 --- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp +++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp @@ -60,7 +60,7 @@ // is performed and finally the results are post processed. // template -int run_sparse_matrix_vector_multiply_example(selectorType &selector) { +int run_sparse_matrix_vector_multiply_example(selectorType& selector) { auto queue = selector.get_queue(); // Matrix data size @@ -75,11 +75,11 @@ int run_sparse_matrix_vector_multiply_example(selectorType &selector) { intType host_ia[] = { 0, 0, 1, 3, 4, 4, 4, 7, 7 }; intType host_ja[] = { 0, 7, 2, 2, 5, 4, 0, 0, 7 }; - intType *ia = (intType *)sycl::malloc_shared(nnz * sizeof(intType), queue); - intType *ja = (intType *)sycl::malloc_shared(nnz * sizeof(intType), queue); - fpType *a = (fpType *)sycl::malloc_shared(nnz * sizeof(fpType), queue); - fpType *x = (fpType *)sycl::malloc_shared(size * sizeof(fpType), queue); - fpType *y = (fpType *)sycl::malloc_shared(size * sizeof(fpType), queue); + intType* ia = (intType*)sycl::malloc_shared(nnz * sizeof(intType), queue); + intType* ja = (intType*)sycl::malloc_shared(nnz * sizeof(intType), queue); + fpType* a = (fpType*)sycl::malloc_shared(nnz * sizeof(fpType), queue); + fpType* x = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue); + fpType* y = (fpType*)sycl::malloc_shared(size * sizeof(fpType), queue); if (!ia || !ja || !a || !x || !y) { throw std::runtime_error("Failed to allocate USM memory"); @@ -100,10 +100,10 @@ int run_sparse_matrix_vector_multiply_example(selectorType &selector) { y[i] = set_fp_value(fpType(0.0)); } - std::vector int_ptr_vec; + std::vector int_ptr_vec; int_ptr_vec.push_back(ia); int_ptr_vec.push_back(ja); - std::vector fp_ptr_vec; + std::vector fp_ptr_vec; fp_ptr_vec.push_back(a); fp_ptr_vec.push_back(x); 
fp_ptr_vec.push_back(y); @@ -148,7 +148,7 @@ int run_sparse_matrix_vector_multiply_example(selectorType &selector) { std::size_t workspace_size = 0; oneapi::mkl::sparse::spmv_buffer_size(selector, transA, &alpha, A_view, A_handle, x_handle, &beta, y_handle, alg, descr, workspace_size); - void *workspace = sycl::malloc_device(workspace_size, queue); + void* workspace = sycl::malloc_device(workspace_size, queue); // Optimize spmv auto ev_opt = @@ -176,7 +176,7 @@ int run_sparse_matrix_vector_multiply_example(selectorType &selector) { // Post Processing // - fpType *res = y; + fpType* res = y; fpType expected_res[size]; const bool isConj = (transA == oneapi::mkl::transpose::conjtrans); for (intType row = 0; row < size; row++) { @@ -243,15 +243,15 @@ void print_example_banner() { // // Main entry point for example // -int main(int /*argc*/, char ** /*argv*/) { +int main(int /*argc*/, char** /*argv*/) { print_example_banner(); auto exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cout << "Caught asynchronous SYCL " "exception during sparse::spmv:\n" << e.what() << std::endl; @@ -281,13 +281,13 @@ int main(int /*argc*/, char ** /*argv*/) { run_sparse_matrix_vector_multiply_example(gpu_selector); std::cout << "Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE." 
<< std::endl; } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl; std::cerr << "\t" << e.what() << std::endl; std::cerr << "\tSYCL error code: " << e.code().value() << std::endl; return 1; } - catch (std::exception const &e) { + catch (std::exception const& e) { std::cerr << "Caught std::exception during Sparse SPMV:" << std::endl; std::cerr << "\t" << e.what() << std::endl; return 1; diff --git a/examples/sparse_blas/run_time_dispatching/sparse_blas_spmv_usm.cpp b/examples/sparse_blas/run_time_dispatching/sparse_blas_spmv_usm.cpp index d87297600..69be82745 100644 --- a/examples/sparse_blas/run_time_dispatching/sparse_blas_spmv_usm.cpp +++ b/examples/sparse_blas/run_time_dispatching/sparse_blas_spmv_usm.cpp @@ -61,7 +61,7 @@ // is performed and finally the results are post processed. // template -int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) { +int run_sparse_matrix_vector_multiply_example(const sycl::device& dev) { // Matrix data size intType size = 4; intType nrows = size * size * size; @@ -72,11 +72,11 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) { // Catch asynchronous exceptions auto exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cout << "Caught asynchronous SYCL " "exception during sparse::spmv:\n" << e.what() << std::endl; @@ -95,12 +95,12 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) { std::size_t sizevec = static_cast(nrows); auto sizevec_i64 = static_cast(sizevec); - ia = (intType *)sycl::malloc_shared(sizeia * sizeof(intType), main_queue); - ja = (intType *)sycl::malloc_shared(sizeja * sizeof(intType), main_queue); - a = (fp 
*)sycl::malloc_shared(sizea * sizeof(fp), main_queue); - x = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue); - y = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue); - z = (fp *)sycl::malloc_shared(sizevec * sizeof(fp), main_queue); + ia = (intType*)sycl::malloc_shared(sizeia * sizeof(intType), main_queue); + ja = (intType*)sycl::malloc_shared(sizeja * sizeof(intType), main_queue); + a = (fp*)sycl::malloc_shared(sizea * sizeof(fp), main_queue); + x = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), main_queue); + y = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), main_queue); + z = (fp*)sycl::malloc_shared(sizevec * sizeof(fp), main_queue); if (!ia || !ja || !a || !x || !y || !z) { throw std::runtime_error("Failed to allocate USM memory"); @@ -115,10 +115,10 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) { z[i] = set_fp_value(fp(0.0)); } - std::vector int_ptr_vec; + std::vector int_ptr_vec; int_ptr_vec.push_back(ia); int_ptr_vec.push_back(ja); - std::vector fp_ptr_vec; + std::vector fp_ptr_vec; fp_ptr_vec.push_back(a); fp_ptr_vec.push_back(x); fp_ptr_vec.push_back(y); @@ -160,7 +160,7 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) { std::size_t workspace_size = 0; oneapi::mkl::sparse::spmv_buffer_size(main_queue, transA, &alpha, A_view, A_handle, x_handle, &beta, y_handle, alg, descr, workspace_size); - void *workspace = sycl::malloc_device(workspace_size, main_queue); + void* workspace = sycl::malloc_device(workspace_size, main_queue); // Optimize spmv auto ev_opt = @@ -189,7 +189,7 @@ int run_sparse_matrix_vector_multiply_example(const sycl::device &dev) { // Post Processing // - fp *res = y; + fp* res = y; const bool isConj = (transA == oneapi::mkl::transpose::conjtrans); for (intType row = 0; row < nrows; row++) { z[row] *= beta; @@ -258,7 +258,7 @@ void print_example_banner() { // // Main entry point for example // -int main(int /*argc*/, char ** /*argv*/) { +int main(int /*argc*/, 
char** /*argv*/) { print_example_banner(); try { @@ -279,13 +279,13 @@ int main(int /*argc*/, char ** /*argv*/) { run_sparse_matrix_vector_multiply_example(dev); std::cout << "Sparse BLAS SPMV USM example ran OK." << std::endl; } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cerr << "Caught synchronous SYCL exception during Sparse SPMV:" << std::endl; std::cerr << "\t" << e.what() << std::endl; std::cerr << "\tSYCL error code: " << e.code().value() << std::endl; return 1; } - catch (std::exception const &e) { + catch (std::exception const& e) { std::cerr << "Caught std::exception during Sparse SPMV:" << std::endl; std::cerr << "\t" << e.what() << std::endl; return 1; diff --git a/include/oneapi/mkl/bfloat16.hpp b/include/oneapi/mkl/bfloat16.hpp index afa155b1a..127d5ced4 100644 --- a/include/oneapi/mkl/bfloat16.hpp +++ b/include/oneapi/mkl/bfloat16.hpp @@ -70,7 +70,7 @@ struct bfloat16 { inline bfloat16(float f); bfloat16(double d) : bfloat16(float(d)) {} template - bfloat16(T i, typename std::enable_if::value>::type *_ = nullptr) + bfloat16(T i, typename std::enable_if::value>::type* _ = nullptr) : bfloat16(float(i)) {} inline operator float() const; @@ -101,115 +101,115 @@ struct bfloat16 { return h; } - friend float operator+(const bfloat16 &h1, const bfloat16 &h2) { + friend float operator+(const bfloat16& h1, const bfloat16& h2) { return float(h1) + float(h2); } - friend float operator-(const bfloat16 &h1, const bfloat16 &h2) { + friend float operator-(const bfloat16& h1, const bfloat16& h2) { return float(h1) - float(h2); } - friend float operator*(const bfloat16 &h1, const bfloat16 &h2) { + friend float operator*(const bfloat16& h1, const bfloat16& h2) { return float(h1) * float(h2); } - friend float operator/(const bfloat16 &h1, const bfloat16 &h2) { + friend float operator/(const bfloat16& h1, const bfloat16& h2) { return float(h1) / float(h2); } template friend typename std::enable_if::value, float>::type operator+( - 
const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) + float(o); } template friend typename std::enable_if::value, float>::type operator-( - const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) - float(o); } template friend typename std::enable_if::value, float>::type operator*( - const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) * float(o); } template friend typename std::enable_if::value, float>::type operator/( - const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) / float(o); } template friend typename std::enable_if::value, float>::type operator+( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return float(o) + float(h); } template friend typename std::enable_if::value, float>::type operator-( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return float(o) - float(h); } template friend typename std::enable_if::value, float>::type operator*( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return float(o) * float(h); } template friend typename std::enable_if::value, float>::type operator/( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return float(o) / float(h); } template friend typename std::enable_if::value, T>::type operator+( - const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) + o; } template friend typename std::enable_if::value, T>::type operator-( - const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) - o; } template friend typename std::enable_if::value, T>::type operator*( - const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) * o; } template friend typename std::enable_if::value, T>::type operator/( - const bfloat16 &h, const T &o) { + const bfloat16& h, const T& o) { return float(h) / o; } template friend typename std::enable_if::value, 
T>::type operator+( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return o + float(h); } template friend typename std::enable_if::value, T>::type operator-( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return o - float(h); } template friend typename std::enable_if::value, T>::type operator*( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return o * float(h); } template friend typename std::enable_if::value, T>::type operator/( - const T &o, const bfloat16 &h) { + const T& o, const bfloat16& h) { return o / float(h); } template - bfloat16 operator+=(const T &o) { + bfloat16 operator+=(const T& o) { return *this = bfloat16(*this + o); } template - bfloat16 operator-=(const T &o) { + bfloat16 operator-=(const T& o) { return *this = bfloat16(*this - o); } template - bfloat16 operator*=(const T &o) { + bfloat16 operator*=(const T& o) { return *this = bfloat16(*this * o); } template - bfloat16 operator/=(const T &o) { + bfloat16 operator/=(const T& o) { return *this = bfloat16(*this / o); } }; diff --git a/include/oneapi/mkl/blas.hxx b/include/oneapi/mkl/blas.hxx index 374585912..cb89703fc 100644 --- a/include/oneapi/mkl/blas.hxx +++ b/include/oneapi/mkl/blas.hxx @@ -19,1723 +19,1679 @@ // Buffer APIs -static inline void asum(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { detail::asum(get_device_id(queue), queue, n, x, incx, result); } -static inline void asum(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void asum(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { detail::asum(get_device_id(queue), queue, n, x, incx, result); } -static inline void asum(sycl::queue &queue, std::int64_t n, 
sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static inline void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::asum(get_device_id(queue), queue, n, x, incx, result); } -static inline void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static inline void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::asum(get_device_id(queue), queue, n, x, incx, result); } -static inline void axpy(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +static inline void axpy(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy); } -static inline void axpy(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +static inline void axpy(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy) { detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy); } -static inline void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +static inline void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy); } -static inline void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +static inline void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, 
std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy); } -static inline void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { +static inline void axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { +static inline void axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +static inline void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +static inline void 
axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void axpby(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +static inline void axpby(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy); } -static inline void axpby(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { +static inline void axpby(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy) { detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy); } -static inline void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +static inline void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy); } -static inline void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +static inline void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { detail::axpby(get_device_id(queue), queue, n, alpha, x, 
incx, beta, y, incy); } -static inline void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +static inline void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { detail::copy(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +static inline void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { detail::copy(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void copy(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +static inline void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy) { detail::copy(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void copy(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +static inline void copy(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { detail::copy(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { +static inline void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void copy_batch(sycl::queue 
&queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, +static inline void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void copy_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +static inline void copy_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void copy_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +static inline void copy_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -static inline void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +static inline void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result); } -static inline void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +static inline void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& 
x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result); } -static inline void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +static inline void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result); } -static inline void dotc(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +static inline void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& result) { detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result); } -static inline void dotc(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +static inline void dotc(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result); } -static inline void dotu(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +static inline void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& result) { detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result); } -static inline void dotu(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { 
+static inline void dotu(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result); } -static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +static inline void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { +static inline void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, +static inline void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + 
std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -static inline void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, +static inline void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc) { +static inline void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +static inline void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { detail::gemm(get_device_id(queue), 
queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +static inline void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +static inline void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +static inline void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::half beta, - sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::half beta, + sycl::buffer& c, std::int64_t ldc) { detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm(sycl::queue &queue, transpose transa, transpose 
transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +static inline void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + float beta, sycl::buffer& c, std::int64_t ldc) { detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +static inline void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + float beta, sycl::buffer& c, std::int64_t ldc) { detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } 
-static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t 
stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, sycl::half beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -static inline void 
gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -static inline void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - int8_t ao, sycl::buffer &b, std::int64_t ldb, - uint8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { +static inline void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, int8_t ao, 
+ sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co) { detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - int8_t ao, sycl::buffer &b, std::int64_t ldb, int8_t bo, - float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { +static inline void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, int8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co) { detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - uint8_t ao, sycl::buffer &b, std::int64_t ldb, - int8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { +static inline void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co) { detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -static inline void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t 
k, - float alpha, sycl::buffer &a, std::int64_t lda, - uint8_t ao, sycl::buffer &b, std::int64_t ldb, - uint8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { +static inline void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co) { detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -static inline void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +static inline void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +static inline void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + double beta, sycl::buffer& c, std::int64_t ldc) { detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void 
gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +static inline void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc) { detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +static inline void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc) { detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +static inline void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline 
void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { +static inline void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy) { detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& y, std::int64_t incy) { detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& y, std::int64_t incy) { detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, float beta, - sycl::buffer &y, std::int64_t incy, - 
std::int64_t stridey, std::int64_t batch_size) { +static inline void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, +static inline void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, +static inline void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::gemv_batch(get_device_id(queue), queue, trans, m, n, 
alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -static inline void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, +static inline void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +static inline void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { +static inline void dgmm_batch(sycl::queue& queue, side left_right, 
std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { +static inline void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -static inline void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { +static inline void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -static inline void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, 
float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda) { +static inline void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda); } -static inline void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda) { +static inline void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda); } -static inline void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +static inline void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda) { detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda); } -static inline void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +static inline void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda) { detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda); } -static inline void geru(sycl::queue &queue, std::int64_t m, 
std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +static inline void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda) { detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda); } -static inline void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +static inline void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda) { detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda); } -static inline void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& y, std::int64_t incy) { detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -static inline void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + 
sycl::buffer, 1>& y, std::int64_t incy) { detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -static inline void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, +static inline void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, +static inline void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& y, std::int64_t incy) { detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - 
std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& y, std::int64_t incy) { detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +static inline void her(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda); } -static inline void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +static inline void her(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda); } -static inline void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +static inline void her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda) { detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -static inline void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - 
std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +static inline void her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda) { detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -static inline void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +static inline void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +static inline void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - double beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, +static inline void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t 
lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -static inline void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer, 1> &a, - std::int64_t lda, double beta, sycl::buffer, 1> &c, +static inline void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -static inline void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +static inline void hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -static inline void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +static inline void hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -static inline void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +static inline void hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + 
sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a); } -static inline void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +static inline void hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a); } -static inline void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a) { +static inline void hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a) { detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a); } -static inline void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a) { +static inline void hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a) { detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a); } -static inline void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static inline void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::iamax(get_device_id(queue), queue, n, x, incx, result); } -static inline void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static 
inline void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::iamax(get_device_id(queue), queue, n, x, incx, result); } -static inline void iamax(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void iamax(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { detail::iamax(get_device_id(queue), queue, n, x, incx, result); } -static inline void iamax(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void iamax(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { detail::iamax(get_device_id(queue), queue, n, x, incx, result); } -static inline void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static inline void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::iamin(get_device_id(queue), queue, n, x, incx, result); } -static inline void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static inline void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::iamin(get_device_id(queue), queue, n, x, incx, result); } -static inline void iamin(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void iamin(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { detail::iamin(get_device_id(queue), queue, n, x, incx, result); } -static inline void iamin(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void iamin(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& 
x, std::int64_t incx, + sycl::buffer& result) { detail::iamin(get_device_id(queue), queue, n, x, incx, result); } -static inline void nrm2(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { detail::nrm2(get_device_id(queue), queue, n, x, incx, result); } -static inline void nrm2(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +static inline void nrm2(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { detail::nrm2(get_device_id(queue), queue, n, x, incx, result); } -static inline void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static inline void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::nrm2(get_device_id(queue), queue, n, x, incx, result); } -static inline void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +static inline void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { detail::nrm2(get_device_id(queue), queue, n, x, incx, result); } -static inline void rot(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, - float s) { +static inline void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, float c, float s) { detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s); } -static inline void rot(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, - double s) { +static inline void rot(sycl::queue& queue, 
std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, double c, double s) { detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s); } -static inline void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, +static inline void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s); } -static inline void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - double c, double s) { +static inline void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, + double s) { detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s); } -static inline void rotg(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +static inline void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { detail::rotg(get_device_id(queue), queue, a, b, c, s); } -static inline void rotg(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +static inline void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { detail::rotg(get_device_id(queue), queue, a, b, c, s); } -static inline void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +static inline void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { detail::rotg(get_device_id(queue), queue, a, b, c, s); } -static inline void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, - sycl::buffer &c, - sycl::buffer, 1> &s) { +static 
inline void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { detail::rotg(get_device_id(queue), queue, a, b, c, s); } -static inline void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +static inline void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param); } -static inline void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +static inline void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param); } -static inline void rotmg(sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer ¶m) { +static inline void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param) { detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param); } -static inline void rotmg(sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, - double y1, sycl::buffer ¶m) { +static inline void rotmg(sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param); } -static inline void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +static inline void sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer& a, 
std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -static inline void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { +static inline void sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy) { detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -static inline void scal(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { +static inline void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx) { detail::scal(get_device_id(queue), queue, n, alpha, x, incx); } -static inline void scal(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { +static inline void scal(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx) { detail::scal(get_device_id(queue), queue, n, alpha, x, incx); } -static inline void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { detail::scal(get_device_id(queue), queue, n, alpha, x, incx); } -static inline void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { detail::scal(get_device_id(queue), queue, n, alpha, x, incx); 
} -static inline void scal(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void scal(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx) { detail::scal(get_device_id(queue), queue, n, alpha, x, incx); } -static inline void scal(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void scal(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx) { detail::scal(get_device_id(queue), queue, n, alpha, x, incx); } -static inline void sdsdot(sycl::queue &queue, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +static inline void sdsdot(sycl::queue& queue, std::int64_t n, float sb, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { detail::sdsdot(get_device_id(queue), queue, n, sb, x, incx, y, incy, result); } -static inline void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, - std::int64_t incy) { +static inline void spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -static inline void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, - std::int64_t incy) { +static inline void spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, 
x, incx, beta, y, incy); } -static inline void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &a) { +static inline void spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a); } -static inline void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &a) { +static inline void spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a); } -static inline void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a) { +static inline void spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a); } -static inline void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a) { +static inline void spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a); } -static inline void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +static inline void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { 
detail::swap(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +static inline void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { detail::swap(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void swap(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +static inline void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy) { detail::swap(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void swap(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +static inline void swap(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { detail::swap(get_device_id(queue), queue, n, x, incx, y, incy); } -static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc) { +static inline void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - double beta, sycl::buffer &c, std::int64_t ldc) { +static 
inline void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, +static inline void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, +static inline void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +static inline void symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, + 
std::int64_t incy) { detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { +static inline void symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -static inline void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &a, std::int64_t lda) { +static inline void syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, + std::int64_t lda) { detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda); } -static inline void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &a, std::int64_t lda) { +static inline void syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, + std::int64_t lda) { detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda); } -static inline void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda) { +static inline void syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, 
a, lda); } -static inline void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda) { +static inline void syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc) { +static inline void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - double beta, sycl::buffer &c, std::int64_t ldc) { +static inline void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +static inline void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, 
std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +static inline void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &c, - std::int64_t ldc) { +static inline void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, - std::int64_t ldc) { +static inline void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + double beta, sycl::buffer& c, std::int64_t ldc) { detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, 
lda, beta, c, ldc); } -static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +static inline void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -static inline void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +static inline void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { +static inline void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer 
&c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { +static inline void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, +static inline void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -static inline void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, +static inline void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer 
&a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +static inline void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +static inline void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, + std::int64_t incx) { detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag 
unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +static inline void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +static inline void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +static inline void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, + std::int64_t incx) { detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -static inline void 
tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { +static inline void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { +static inline void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { +static inline void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag 
unit_diag, + std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { +static inline void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +static inline void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx); } -static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, 
m, n, alpha, a, lda, b, ldb); } -static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -static inline void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +static inline void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + 
sycl::buffer& x, std::int64_t incx) { detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +static inline void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx) { +static inline void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx) { +static inline void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { 
detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb) { detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -static inline void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +static inline void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -static inline void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, +static inline void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, 
sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size) { detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, +static inline void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size) { detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, +static inline void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size) { detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void trsm_batch(sycl::queue &queue, side left_right, 
uplo upper_lower, +static inline void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size) { detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +static inline void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +static inline void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx) { +static inline void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 
1>& x, std::int64_t incx) { detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx) { +static inline void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, +static inline void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, +static inline void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - 
std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, +static inline void imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, +static inline void imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline void imatcopy_batch(sycl::queue& queue, transpose trans, 
std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -static inline void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline void imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - float beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + float beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& b, std::int64_t ldb, std::int64_t 
stride_b, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -static inline void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { +static inline 
void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb); } -static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { +static inline void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb); } -static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, +static inline void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb); } -static inline void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, +static inline void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb); } -static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, +static inline void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, 
std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, +static inline void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +static inline void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -static inline void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +static inline void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &ab, std::int64_t lda, +static inline void 
imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb); } -static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &ab, std::int64_t lda, +static inline void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb); } -static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +static inline void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb); } -static inline void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +static inline void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb); } -static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { +static inline void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { detail::omatadd(get_device_id(queue), queue, 
transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &c, std::int64_t ldc) { +static inline void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& c, std::int64_t ldc) { detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +static inline void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, sycl::buffer, 1>& c, std::int64_t ldc) { detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -static inline void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +static inline void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, sycl::buffer, 1>& c, std::int64_t ldc) { detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); @@ -1743,603 +1699,590 @@ static inline void omatadd(sycl::queue &queue, transpose transa, 
transpose trans // USM APIs -static inline sycl::event asum(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}) { +static inline sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}) { auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event asum(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}) { +static inline sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}) { auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies = {}) { +static inline sycl::event asum(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}) { auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}) { +static inline sycl::event asum(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}) { auto done = detail::asum(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event 
axpy(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies); return done; } -static inline sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event axpy(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies); return done; } -static inline sycl::event axpy(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies); return done; } -static inline sycl::event axpy(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpy(get_device_id(queue), queue, n, alpha, x, incx, y, incy, dependencies); return done; } -static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha, - const double **x, std::int64_t *incx, double **y, - std::int64_t 
*incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, double* alpha, + const double** x, std::int64_t* incx, double** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha, - const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, - std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, - std::complex *alpha, const std::complex **x, - 
std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, - float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, std::int64_t stridex, + float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, - double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, std::int64_t stridex, + double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event 
axpy_batch(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::axpy_batch(get_device_id(queue), queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event axpby(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event 
axpby(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, - double *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event axpby(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event axpby(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event axpby(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::axpby(get_device_id(queue), queue, n, alpha, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event copy(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, 
float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event copy(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event copy(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event copy(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::copy(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const float **x, - std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, + 
std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x, - std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const double** x, + std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& 
dependencies = {}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = 
{}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event copy_batch(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event copy_batch(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::copy_batch(get_device_id(queue), queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - float *result, - const std::vector &dependencies = {}) { +static inline sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies = {}) { auto done = detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies); return done; } -static inline sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, - double *result, - const std::vector &dependencies = {}) { +static inline sycl::event dot(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}) { auto done = detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies); return done; } -static inline sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - double 
*result, - const std::vector &dependencies = {}) { +static inline sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}) { auto done = detail::dot(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies); return done; } -static inline sycl::event dotc(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}) { +static inline sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}) { auto done = detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies); return done; } -static inline sycl::event dotc(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}) { +static inline sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}) { auto done = detail::dotc(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies); return done; } -static inline sycl::event dotu(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}) { +static inline sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}) { auto done = detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result, 
dependencies); return done; } -static inline sycl::event dotu(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}) { +static inline sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}) { auto done = detail::dotu(get_device_id(queue), queue, n, x, incx, y, incy, result, dependencies); return done; } -static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, - const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = 
detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::gbmv(get_device_id(queue), queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float 
*b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& 
dependencies = {}) { auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, - std::int64_t ldb, sycl::half beta, sycl::half *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + const sycl::half* a, std::int64_t lda, const sycl::half* b, + std::int64_t ldb, sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const sycl::half *a, std::int64_t lda, 
const sycl::half *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const sycl::half* a, std::int64_t lda, const sycl::half* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const bfloat16 *a, std::int64_t lda, const bfloat16 *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const bfloat16* a, std::int64_t lda, const bfloat16* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemm(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const float **a, - std::int64_t *lda, const float **b, std::int64_t *ldb, - float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, + const float** b, std::int64_t* ldb, float* beta, float** c, + 
std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, double *alpha, const double **a, - std::int64_t *lda, const double **b, std::int64_t *ldb, - double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, + const double** b, std::int64_t* ldb, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch( - sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **b, std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, + std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, + std::int64_t 
group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch( - sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **b, std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, + std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, sycl::half *alpha, const sycl::half **a, - std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, - sycl::half *beta, sycl::half **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, + sycl::half** c, std::int64_t* ldc, std::int64_t 
group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, float *beta, - float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, float* beta, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, - float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) 
{ auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, - std::int32_t **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, + std::int32_t** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, const float *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, - std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, + float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = 
detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, const double *b, std::int64_t ldb, - std::int64_t stride_b, double beta, double *c, - std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, + double beta, double* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2347,11 +2290,11 @@ static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, trans } static inline sycl::event gemm_batch( - sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}) { + sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + 
std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2359,226 +2302,219 @@ static inline sycl::event gemm_batch( } static inline sycl::event gemm_batch( - sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}) { + sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, - std::int64_t ldb, std::int64_t stride_b, sycl::half beta, - sycl::half *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, 
std::int64_t k, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, + std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, - float beta, float *c, std::int64_t ldc, std::int64_t stride_c, + const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, + float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, - float beta, float *c, std::int64_t ldc, std::int64_t stride_c, + const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, 
std::int64_t stride_b, + float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, - float beta, std::int32_t *c, std::int64_t ldc, + const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, + float beta, std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::gemm_batch(get_device_id(queue), queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemmt(get_device_id(queue), queue, 
upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex 
beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::gemmt(get_device_id(queue), queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int8_t ao, const std::uint8_t *b, - std::int64_t ldb, std::uint8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}) { auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); return done; } -static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int8_t ao, const std::int8_t *b, - std::int64_t ldb, std::int8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}) { +static inline 
sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}) { auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); return done; } -static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, - std::int64_t lda, std::uint8_t ao, const std::int8_t *b, - std::int64_t ldb, std::int8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}) { +static inline sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}) { auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); return done; } -static inline sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, - std::int64_t lda, std::uint8_t ao, const std::uint8_t *b, - std::int64_t ldb, std::uint8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}) { +static inline sycl::event 
gemm_bias(sycl::queue& queue, transpose transa, transpose transb, + offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}) { auto done = detail::gemm_bias(get_device_id(queue), queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); return done; } -static inline sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gemv(sycl::queue &queue, transpose trans, 
std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::gemv(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float beta, - float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const 
float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double beta, - double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double beta, + double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2586,11 +2522,11 @@ static inline sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::i } static inline sycl::event gemv_batch( - sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, std::complex beta, - std::complex *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}) { + sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t 
incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2598,1538 +2534,1508 @@ static inline sycl::event gemv_batch( } static inline sycl::event gemv_batch( - sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}) { + sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, - float *beta, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemv_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, + float* beta, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* 
group_size, + const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, - double *beta, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemv_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, + double* beta, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemv_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, + std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); 
return done; } -static inline sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event gemv_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, + std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::gemv_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, 
std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event 
dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, + std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, + std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, + std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, 
left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, + std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, + std::int64_t* incx, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, + std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, + std::int64_t* incx, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::dgmm_batch(get_device_id(queue), queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies = 
{}) { +static inline sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::ger(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + 
std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::gerc(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::geru(get_device_id(queue), queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event hbmv(sycl::queue& 
queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::hbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event 
hemm(sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::hemm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + 
std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::hemv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event her(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } -static inline sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event her(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::her(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } -static inline sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, 
incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::her2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, float beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, float beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const 
std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, double beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::her2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, - const std::complex *a, std::int64_t lda, float beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, + const std::complex* a, std::int64_t lda, float beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -static inline sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, - const std::complex *a, std::int64_t lda, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, + const std::complex* a, std::int64_t lda, double beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::herk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -static inline sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event hpmv(sycl::queue& queue, 
uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::hpmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, - const std::vector &dependencies = {}) { +static inline sycl::event hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies = {}) { auto done = detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); return done; } -static inline sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, - const std::vector &dependencies = {}) { +static inline sycl::event hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies = {}) { auto done = 
detail::hpr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); return done; } -static inline sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, - const std::vector &dependencies = {}) { +static inline sycl::event hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}) { auto done = detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -static inline sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, - const std::vector &dependencies = {}) { +static inline sycl::event hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}) { auto done = detail::hpr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -static inline sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}) { +static inline sycl::event iamax(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}) { 
+static inline sycl::event iamax(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event iamax(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}) { +static inline sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event iamax(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}) { +static inline sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = detail::iamax(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}) { +static inline sycl::event iamin(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}) { +static inline sycl::event iamin(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = 
detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event iamin(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}) { +static inline sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event iamin(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}) { +static inline sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}) { auto done = detail::iamin(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}) { +static inline sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}) { auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}) { +static inline sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}) { auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t 
incx, float *result, - const std::vector &dependencies = {}) { +static inline sycl::event nrm2(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}) { auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}) { +static inline sycl::event nrm2(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}) { auto done = detail::nrm2(get_device_id(queue), queue, n, x, incx, result, dependencies); return done; } -static inline sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - float c, float s, - const std::vector &dependencies = {}) { +static inline sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, float c, + float s, const std::vector& dependencies = {}) { auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies); return done; } -static inline sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - double c, double s, - const std::vector &dependencies = {}) { +static inline sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + double c, double s, + const std::vector& dependencies = {}) { auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies); return done; } -static inline sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies = {}) { +static inline 
sycl::event rot(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, + float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies = {}) { auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies); return done; } -static inline sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, - double s, const std::vector &dependencies = {}) { +static inline sycl::event rot(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, + double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies = {}) { auto done = detail::rot(get_device_id(queue), queue, n, x, incx, y, incy, c, s, dependencies); return done; } -static inline sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, - const std::vector &dependencies = {}) { +static inline sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies = {}) { auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies); return done; } -static inline sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, - double *s, - const std::vector &dependencies = {}) { +static inline sycl::event rotg(sycl::queue& queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies = {}) { auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies); return done; } -static inline sycl::event rotg(sycl::queue &queue, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies = {}) { +static inline sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, + float* c, std::complex* s, + const std::vector& dependencies = {}) { auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies); return done; } -static inline sycl::event rotg(sycl::queue &queue, std::complex *a, - 
std::complex *b, double *c, std::complex *s, - const std::vector &dependencies = {}) { +static inline sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, + double* c, std::complex* s, + const std::vector& dependencies = {}) { auto done = detail::rotg(get_device_id(queue), queue, a, b, c, s, dependencies); return done; } -static inline sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies = {}) { +static inline sycl::event rotm(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, + float* y, std::int64_t incy, float* param, + const std::vector& dependencies = {}) { auto done = detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param, dependencies); return done; } -static inline sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies = {}) { +static inline sycl::event rotm(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, + double* y, std::int64_t incy, double* param, + const std::vector& dependencies = {}) { auto done = detail::rotm(get_device_id(queue), queue, n, x, incx, y, incy, param, dependencies); return done; } -static inline sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, - float y1, float *param, - const std::vector &dependencies = {}) { +static inline sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, float y1, + float* param, const std::vector& dependencies = {}) { auto done = detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param, dependencies); return done; } -static inline sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, - double y1, double *param, - const std::vector &dependencies = {}) { +static inline sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, double y1, + double* param, 
const std::vector& dependencies = {}) { auto done = detail::rotmg(get_device_id(queue), queue, d1, d2, x1, y1, param, dependencies); return done; } -static inline sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::sbmv(get_device_id(queue), queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, float* x, + std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies); return done; } -static inline sycl::event scal(sycl::queue &queue, 
std::int64_t n, double alpha, double *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, double* x, + std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies); return done; } -static inline sycl::event scal(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies); return done; } -static inline sycl::event scal(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies); return done; } -static inline sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies); return done; } -static inline sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto 
done = detail::scal(get_device_id(queue), queue, n, alpha, x, incx, dependencies); return done; } -static inline sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *result, - const std::vector &dependencies = {}) { +static inline sycl::event sdsdot(sycl::queue& queue, std::int64_t n, float sb, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies = {}) { auto done = detail::sdsdot(get_device_id(queue), queue, n, sb, x, incx, y, incy, result, dependencies); return done; } -static inline sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *a, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::spmv(get_device_id(queue), queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, 
float *a, - const std::vector &dependencies = {}) { +static inline sycl::event spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const float* x, std::int64_t incx, float* a, + const std::vector& dependencies = {}) { auto done = detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); return done; } -static inline sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies = {}) { +static inline sycl::event spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* x, std::int64_t incx, double* a, + const std::vector& dependencies = {}) { auto done = detail::spr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, dependencies); return done; } -static inline sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, - const std::vector &dependencies = {}) { +static inline sycl::event spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies = {}) { auto done = detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -static inline sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, - const std::vector &dependencies = {}) { +static inline sycl::event spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, + const std::vector& dependencies = {}) { auto done = detail::spr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, 
dependencies); return done; } -static inline sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event swap(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, + float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event swap(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, + double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::swap(get_device_id(queue), queue, n, x, incx, y, incy, dependencies); return done; } -static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float 
alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, + float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, - double *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, + double* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto 
done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::symm(get_device_id(queue), queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}) { +static inline sycl::event symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* a, std::int64_t lda, 
const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}) { auto done = detail::symv(get_device_id(queue), queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -static inline sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } -static inline sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::syr(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } -static inline sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event 
syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies = {}) { +static inline sycl::event syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}) { auto done = detail::syr2(get_device_id(queue), queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, + float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, + double* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, 
alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syr2k(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event syrk(sycl::queue& queue, uplo upper_lower, 
transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -static inline sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event 
syrk(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::syrk(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, - float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, + std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, - double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, + std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, 
group_size, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, 
- float beta, float *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, + std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - double beta, double *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex 
alpha, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, + std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, + std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::syrk_batch(get_device_id(queue), queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, + std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, 
transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, + std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, 
std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, + std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, + std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, 
std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tbsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } -static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, + std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, + std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& 
dependencies = {}) { auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tpmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, + std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, - std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, + std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, 
std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::tpsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } -static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline 
sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trmm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose 
trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event 
trmv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trmv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex 
*b, std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::trsm(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, - std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = 
detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, - std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, + std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, 
stride_b, batch_size, dependencies); return done; } -static inline sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, + const std::vector& dependencies = {}) { auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -static inline sycl::event trsm_batch(sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, - std::int64_t *m, std::int64_t *n, float *alpha, - const float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, + transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -static inline sycl::event trsm_batch(sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, 
diag *unit_diag, - std::int64_t *m, std::int64_t *n, double *alpha, - const double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, + transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -static inline sycl::event trsm_batch( - sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, diag *unit_diag, - std::int64_t *m, std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, + transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -static inline sycl::event trsm_batch( - sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, diag *unit_diag, - std::int64_t *m, std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, std::int64_t group_count, - 
std::int64_t *group_size, const std::vector &dependencies = {}) { +static inline sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, + transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}) { auto done = detail::trsm_batch(get_device_id(queue), queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, 
std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}) { +static inline sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}) { auto done = detail::trsv(get_device_id(queue), queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, +static inline sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, +static inline sycl::event 
omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -static inline sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - 
std::int64_t n, float alpha, float *ab, std::int64_t lda, +static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } -static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, +static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } -static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } -static inline sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, + std::complex* 
ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::imatcopy_batch(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } -static inline sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, +static inline sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, - float *c, std::int64_t ldc, std::int64_t stride_c, + const float* b, std::int64_t ldb, std::int64_t stride_b, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - double beta, const double *b, std::int64_t ldb, - std::int64_t stride_b, double *c, std::int64_t ldc, + const double* a, std::int64_t lda, std::int64_t stride_a, + double beta, const double* b, std::int64_t ldb, + std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, 
n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4137,185 +4043,185 @@ static inline sycl::event omatadd_batch(sycl::queue &queue, transpose transa, tr } static inline sycl::event omatadd_batch( - sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}) { + sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}) { auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, +static inline sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex *c, + const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatadd_batch(get_device_id(queue), queue, transa, transb, m, n, 
alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); return done; } -static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - float *b, std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - double *b, std::int64_t ldb, - const std::vector &dependencies = {}) { +static inline sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + double* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}) { + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event omatcopy(sycl::queue& queue, 
transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}) { + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::omatcopy(get_device_id(queue), queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, +static inline sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } -static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, +static inline sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } -static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const 
std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } -static inline sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::omatcopy2(get_device_id(queue), queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } -static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, +static inline sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } -static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, +static inline sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}) { + 
const std::vector& dependencies = {}) { auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } -static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}) { + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } -static inline sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, +static inline sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}) { + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}) { auto done = detail::imatcopy(get_device_id(queue), queue, trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } -static inline sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float beta, const float *b, std::int64_t ldb, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float beta, const float* b, std::int64_t ldb, + float* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); return done; } -static inline sycl::event 
omatadd(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double beta, const double *b, std::int64_t ldb, - double *c, std::int64_t ldc, - const std::vector &dependencies = {}) { +static inline sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double beta, const double* b, std::int64_t ldb, + double* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); return done; } -static inline sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, +static inline sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, const std::complex *b, - std::int64_t ldb, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { + const std::complex* a, std::int64_t lda, + std::complex beta, const std::complex* b, + std::int64_t ldb, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); return done; } -static inline sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, +static inline sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, const std::complex *b, - std::int64_t ldb, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}) { + const std::complex* a, std::int64_t lda, + std::complex beta, const std::complex* b, + std::int64_t ldb, std::complex* 
c, std::int64_t ldc, + const std::vector& dependencies = {}) { auto done = detail::omatadd(get_device_id(queue), queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); return done; diff --git a/include/oneapi/mkl/blas/detail/blas_ct_backends.hxx b/include/oneapi/mkl/blas/detail/blas_ct_backends.hxx index afebb93c3..cd03497d6 100644 --- a/include/oneapi/mkl/blas/detail/blas_ct_backends.hxx +++ b/include/oneapi/mkl/blas/detail/blas_ct_backends.hxx @@ -20,2892 +20,2813 @@ // Buffer APIs static inline void syr2(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); static inline void syr2(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); static inline void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& x, std::int64_t incx); static inline void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& x, std::int64_t incx); static inline void scal(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx); static inline void scal(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx); static inline void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer, 1> &x, 
std::int64_t incx); + sycl::buffer, 1>& x, std::int64_t incx); static inline void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& x, std::int64_t incx); static inline void trmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); static inline void trmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); static inline void trmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void trmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void tpmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); static inline void tpmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, 
std::int64_t incx); static inline void tpmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); static inline void tpmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); static inline void spr(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a); + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); static inline void spr(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a); + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); static inline void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& c, std::int64_t ldc); static 
inline void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& c, std::int64_t ldc); static inline void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); static inline void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); static inline void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + float beta, sycl::buffer& c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); static inline void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + double beta, sycl::buffer& c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); static inline void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - 
std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void her2(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); static inline void her2(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); static inline void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, 
std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s); static inline void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s); static inline void rot(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, float c, float s); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, float c, float s); static inline void rot(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, double c, double s); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, double c, double s); static inline void axpy(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); static inline void axpy(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); static inline void axpy(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, + std::complex alpha, 
sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); static inline void axpy(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); static inline void axpy_batch(backend_selector selector, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + float alpha, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); static inline void axpy_batch(backend_selector selector, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + double alpha, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); static inline void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); static inline void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); static 
inline void axpby(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); static inline void axpby(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); static inline void axpby(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); static inline void axpby(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); static inline void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); static inline void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); static inline void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, 
- sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); static inline void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, + std::int64_t ldc); static inline void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); static inline void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); static inline void gemv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); static inline void gemv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, 
std::int64_t incy); + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); static inline void gemv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void gemv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + float beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); static inline void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, double beta, - sycl::buffer &y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + double beta, sycl::buffer& y, std::int64_t incy, 
std::int64_t stridey, std::int64_t batch_size); static inline void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); static inline void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); static inline void dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size); + std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); static inline void dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, + std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& c, 
std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); static inline void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); static inline void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); static inline void her(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a, + std::int64_t n, float alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a, std::int64_t lda); static inline void her(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a, + std::int64_t n, double alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a, std::int64_t lda); static inline void hpr(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a); + std::int64_t n, float alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a); static inline void hpr(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a); + std::int64_t n, double alpha, sycl::buffer, 1>& x, + 
std::int64_t incx, sycl::buffer, 1>& a); static inline void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, double beta, sycl::buffer& c, + 
std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, - sycl::buffer &c, std::int64_t ldc, + sycl::half alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::half beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, 
std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void spmv(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); + std::int64_t n, float alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); static inline void spmv(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); + std::int64_t n, double 
alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); static inline void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); static inline void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); static inline void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, uint8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); static inline void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, uint8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, - 
sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); static inline void swap(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); static inline void swap(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); static inline void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); static inline void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); static inline void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); static inline void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); static inline void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - 
sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); static inline void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); static inline void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc); static inline void 
gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); static inline void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::half beta, - sycl::buffer &c, std::int64_t ldc); + sycl::half alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::half beta, + sycl::buffer& c, std::int64_t ldc); static inline void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); static inline void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); static inline void herk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer, 1> &a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& a, std::int64_t lda, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); static inline void 
herk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& a, std::int64_t lda, double beta, + sycl::buffer, 1>& c, std::int64_t ldc); static inline void ger(backend_selector selector, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); static inline void ger(backend_selector selector, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); static inline void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); static inline void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); static inline void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t 
ldb); static inline void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); static inline void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); static inline void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); static inline void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); static inline void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); static inline void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); + sycl::buffer, 1>& x, std::int64_t incx, + 
sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); static inline void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); static inline void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); static inline void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); static inline void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); static inline void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> 
&y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); static inline void tbmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); static inline void tbmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); static inline void tbmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void tbmv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); static inline void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, + std::int64_t 
ldc); static inline void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); static inline void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); static inline void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); static inline void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); static inline void syr(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda); + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); static inline void syr(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda); + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); static inline void 
trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); static inline void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); static inline void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); static inline void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); -static inline void rotmg(backend_selector selector, - sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, float y1, - sycl::buffer ¶m); +static inline void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param); -static inline void rotmg(backend_selector selector, - sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, double y1, - sycl::buffer ¶m); +static inline void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param); static inline void tpsv(backend_selector selector, uplo upper_lower, - transpose 
trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); static inline void tpsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); static inline void tpsv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); static inline void tpsv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); static inline void trsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); static inline void trsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); static inline void trsv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void trsv(backend_selector selector, uplo upper_lower, transpose 
trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void copy(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); static inline void copy(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); static inline void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); static inline void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); static inline void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); static inline void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); static inline void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, 
sycl::buffer, 1> &y, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); static inline void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); static inline void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); static inline void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); static inline void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, 
std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); static inline void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); static inline void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); static inline void sbmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); static inline void sbmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); static inline void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void asum(backend_selector 
selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void asum(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void asum(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void tbsv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); static inline void tbsv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); static inline void tbsv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void tbsv(backend_selector selector, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); static inline void spr2(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& 
y, std::int64_t incy, sycl::buffer& a); static inline void spr2(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a); static inline void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); static inline void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); static inline void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size); + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size); + std::int64_t n, double 
alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void rotm(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& param); static inline void rotm(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& param); -static inline void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); +static inline void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + sycl::buffer& s); -static inline void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); +static inline void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + 
sycl::buffer& s); static inline void rotg(backend_selector selector, - sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); + sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); static inline void rotg(backend_selector selector, - sycl::buffer, 1> &a, - sycl::buffer, 1> &b, - sycl::buffer &c, - sycl::buffer, 1> &s); + sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); static inline void sdsdot(backend_selector selector, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); static inline void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); static inline void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, + std::int64_t ldb, double beta, sycl::buffer, 1>& c, + std::int64_t ldc); static inline void dot(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); static inline void dot(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t 
incy, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); static inline void dot(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); static inline void symv(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); static inline void symv(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); static inline void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - 
sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); static inline void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); static inline void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); static inline void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); static inline void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); static inline void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - float beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t 
stride_b, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + float beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); static inline void omatcopy(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); + std::int64_t m, 
std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb); static inline void omatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); static inline void omatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); static inline void omatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); static inline void omatcopy2(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &b, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); static inline void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &b, std::int64_t ldb, std::int64_t strideb); + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); static inline void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t 
strideb); static inline void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); static inline void imatcopy(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &ab, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); static inline void imatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb); + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); static inline void imatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); static inline void imatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); static inline void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, + sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); static inline void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, + sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); 
static inline void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); static inline void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); // USM APIs static inline sycl::event syr2(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event syr2(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event scal(backend_selector selector, std::int64_t n, - float alpha, float *x, std::int64_t incx, - const std::vector &dependencies = {}); + float alpha, float* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event scal(backend_selector selector, std::int64_t n, 
- double alpha, double *x, std::int64_t incx, - const std::vector &dependencies = {}); + double alpha, double* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); + std::complex alpha, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event scal(backend_selector selector, std::int64_t n, - float alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + float alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event scal(backend_selector selector, std::int64_t n, - double alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + double alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event trmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const float* a, + std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event trmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const double* a, + std::int64_t lda, double* x, std::int64_t incx, + const 
std::vector& dependencies = {}); static inline sycl::event trmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event trmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tpmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const float *a, - float *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const float* a, + float* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tpmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const double *a, - double *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const double* a, + double* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tpmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); static inline 
sycl::event tpmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event spr(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - float *a, const std::vector &dependencies = {}); + std::int64_t n, float alpha, const float* x, std::int64_t incx, + float* a, const std::vector& dependencies = {}); static inline sycl::event spr(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - double *a, const std::vector &dependencies = {}); + std::int64_t n, double alpha, const double* x, std::int64_t incx, + double* a, const std::vector& dependencies = {}); static inline sycl::event hpmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event hpmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, + std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event 
syrk(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); static inline sycl::event syrk(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, double beta, double *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies = {}); static inline sycl::event syrk(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); static inline sycl::event syrk(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -static inline sycl::event syrk_batch(backend_selector selector, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, float *alpha, const float **a, - std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static 
inline sycl::event syrk_batch(backend_selector selector, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, double *alpha, const double **a, - std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event syrk_batch(backend_selector selector, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event syrk_batch(backend_selector selector, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event syrk_batch(backend_selector selector, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +static inline sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, float* beta, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event syrk_batch(backend_selector 
selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, + double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, std::int64_t stride_a, + float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, + double beta, double* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, + 
std::int64_t batch_size, + const std::vector& dependencies = {}); -static inline sycl::event syrk_batch(backend_selector selector, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event syrk_batch(backend_selector selector, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event syrk_batch(backend_selector selector, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); +static inline sycl::event syrk_batch(backend_selector selector, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); static inline sycl::event her2(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& 
dependencies = {}); static inline sycl::event her2(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event hbmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event hbmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, - const std::vector &dependencies = {}); + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, float c, float s, + const std::vector& dependencies = {}); static inline sycl::event 
rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, double c, double s, - const std::vector &dependencies = {}); + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, double c, double s, + const std::vector& dependencies = {}); -static inline sycl::event rot(backend_selector selector, std::int64_t n, - float *x, std::int64_t incx, float *y, std::int64_t incy, float c, - float s, const std::vector &dependencies = {}); +static inline sycl::event rot(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies = {}); static inline sycl::event rot(backend_selector selector, std::int64_t n, - double *x, std::int64_t incx, double *y, std::int64_t incy, - double c, double s, - const std::vector &dependencies = {}); + double* x, std::int64_t incx, double* y, std::int64_t incy, double c, + double s, const std::vector& dependencies = {}); static inline sycl::event axpy(backend_selector selector, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); + float alpha, const float* x, std::int64_t incx, float* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event axpy(backend_selector selector, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *y, - std::int64_t incy, - const std::vector &dependencies = {}); + double alpha, const double* x, std::int64_t incx, double* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::complex alpha, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const 
std::vector& dependencies = {}); static inline sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t *n, float *alpha, const float **x, - std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t *n, double *alpha, const double **x, - std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t 
n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event axpy_batch(backend_selector selector, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); + std::complex alpha, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + float* alpha, const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + double* alpha, const double** x, std::int64_t* incx, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t n, + float alpha, const float* x, std::int64_t incx, + std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t 
batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t n, + double alpha, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event axpy_batch(backend_selector selector, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); static inline sycl::event axpby(backend_selector selector, std::int64_t n, - float alpha, const float *x, std::int64_t incx, - const float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); + float alpha, const float* x, std::int64_t incx, const float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event axpby(backend_selector selector, std::int64_t n, - double alpha, const double *x, std::int64_t incx, - const double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); + double alpha, const double* x, std::int64_t incx, const double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex beta, + std::complex* 
y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gerc(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event gerc(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event syr2k(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event 
syr2k(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event syr2k(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event syr2k(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, float alpha, 
const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gemv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gemv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gemv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -static inline sycl::event gemv_batch(backend_selector selector, - transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float beta, float *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + 
const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +static inline sycl::event gemv_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float beta, float* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -static inline sycl::event gemv_batch(backend_selector selector, - transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double beta, double *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); +static inline sycl::event gemv_batch(backend_selector selector, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double beta, + double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); static inline sycl::event gemv_batch( backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, std::complex beta, - std::complex *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); 
static inline sycl::event gemv_batch( backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}); - -static inline sycl::event gemv_batch(backend_selector selector, - transpose *trans, std::int64_t *m, std::int64_t *n, - float *alpha, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemv_batch(backend_selector selector, - transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemv_batch(backend_selector selector, - transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemv_batch( - backend_selector selector, transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies = {}); - -static inline sycl::event dgmm_batch(backend_selector selector, - side left_right, std::int64_t m, 
std::int64_t n, - const float *a, std::int64_t lda, std::int64_t stridea, - const float *x, std::int64_t incx, std::int64_t stridex, - float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); + +static inline sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, + const float** a, std::int64_t* lda, const float** x, + std::int64_t* incx, float* beta, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, + const double** a, std::int64_t* lda, const double** x, + std::int64_t* incx, double* beta, double** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + 
const std::vector& dependencies = {}); + +static inline sycl::event dgmm_batch(backend_selector selector, side left_right, + std::int64_t m, std::int64_t n, const float* a, + std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float* c, + std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); -static inline sycl::event dgmm_batch(backend_selector selector, - side left_right, std::int64_t m, std::int64_t n, - const double *a, std::int64_t lda, std::int64_t stridea, - const double *x, std::int64_t incx, std::int64_t stridex, - double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event dgmm_batch(backend_selector selector, - side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event dgmm_batch(backend_selector selector, - side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event dgmm_batch(backend_selector selector, - side *left_right, std::int64_t *m, std::int64_t *n, - const float **a, std::int64_t *lda, const float **x, - std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event dgmm_batch(backend_selector selector, - side *left_right, std::int64_t *m, std::int64_t *n, - const double **a, std::int64_t *lda, const 
double **x, - std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event dgmm_batch(backend_selector selector, - side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event dgmm_batch(backend_selector selector, - side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); +static inline sycl::event dgmm_batch(backend_selector selector, side left_right, + std::int64_t m, std::int64_t n, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double* c, + std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event dgmm_batch(backend_selector selector, side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event dgmm_batch(backend_selector selector, side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, + std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event 
dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, + double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); static inline sycl::event her(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const std::complex *x, - std::int64_t incx, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const std::complex* x, + std::int64_t incx, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event her(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const std::complex *x, - std::int64_t incx, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const std::complex* x, + std::int64_t incx, 
std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event hpr(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const std::complex *x, - std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const std::complex* x, + std::int64_t incx, std::complex* a, + const std::vector& dependencies = {}); static inline sycl::event hpr(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const std::complex *x, - std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const std::complex* x, + std::int64_t incx, std::complex* a, + const std::vector& dependencies = {}); static inline sycl::event iamin(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); + const float* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); static inline sycl::event iamin(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); + const double* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); static inline sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); static inline sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t 
*lda, const float **b, - std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const float** a, + std::int64_t* lda, const float** b, std::int64_t* ldb, + float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, double* alpha, const double** a, + std::int64_t* lda, const double** b, std::int64_t* ldb, + double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* 
transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, sycl::half* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, + sycl::half* beta, sycl::half** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, + float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const std::int8_t** b, std::int64_t* ldb, + float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event 
gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const std::int8_t** b, std::int64_t* ldb, + float* beta, std::int32_t** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); -static inline sycl::event gemm_batch( - backend_selector selector, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, sycl::half *alpha, - const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const sycl::half **a, - std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, - float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, - float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector 
&dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, - float *beta, std::int32_t **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, - float beta, float *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_batch(backend_selector selector, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, - double beta, double *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); +static inline sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, const float* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, const double* b, + std::int64_t ldb, std::int64_t stride_b, double beta, + double* 
c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); static inline sycl::event gemm_batch( backend_selector selector, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event gemm_batch( backend_selector selector, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, std::int64_t ldc, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -static inline sycl::event gemm_batch( - backend_selector selector, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = 
{}); +static inline sycl::event gemm_batch(backend_selector selector, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, + std::int64_t ldb, std::int64_t stride_b, sycl::half beta, + sycl::half* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); static inline sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, - std::int64_t ldb, std::int64_t stride_b, float beta, float *c, + std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, + std::int64_t ldb, std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, - std::int64_t ldb, std::int64_t stride_b, float beta, float *c, + std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, + std::int64_t ldb, std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, + std::int64_t 
k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event spmv(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const float *a, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const float* a, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event spmv(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const double *a, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const double* a, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event swap(backend_selector selector, std::int64_t n, - float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}); + float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event swap(backend_selector selector, std::int64_t n, - double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}); + double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::complex* x, std::int64_t incx, 
std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event geru(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event geru(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies = {}); static inline sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); static inline sycl::event nrm2(backend_selector selector, std::int64_t n, - const float *x, 
std::int64_t incx, float *result, - const std::vector &dependencies = {}); + const float* x, std::int64_t incx, float* result, + const std::vector& dependencies = {}); static inline sycl::event nrm2(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}); + const double* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); static inline sycl::event gemm(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemm(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemm(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + 
std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemm(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemm(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - const sycl::half *b, std::int64_t ldb, sycl::half beta, - sycl::half *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, sycl::half beta, + sycl::half* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemm(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, - const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); static inline sycl::event gemm(backend_selector selector, 
transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const bfloat16 *a, std::int64_t lda, - const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const bfloat16* a, std::int64_t lda, const bfloat16* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event herk(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, - const std::complex *a, std::int64_t lda, float beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, float alpha, + const std::complex* a, std::int64_t lda, float beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event herk(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, double alpha, - const std::complex *a, std::int64_t lda, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const std::complex* a, std::int64_t lda, double beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event ger(backend_selector selector, std::int64_t m, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event ger(backend_selector selector, std::int64_t m, - std::int64_t n, double alpha, const double *x, std::int64_t 
incx, - const double *y, std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event trsm(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event trsm(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + double* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event trsm(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event trsm(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex 
*a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, float **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, 
double **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event trsm_batch(backend_selector selector, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, - const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, + int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, + int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, int64_t m, + int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, + int64_t 
batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, int64_t m, + int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, + int64_t* m, int64_t* n, float* alpha, const float** a, + int64_t* lda, float** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, + int64_t* m, int64_t* n, double* alpha, const double** a, + int64_t* lda, double** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, + int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, + std::complex** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, + int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, + std::complex** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, + const std::vector& dependencies = {}); static inline sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); + const std::complex* x, 
std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); static inline sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); static inline sycl::event hemm(backend_selector selector, side left_right, - uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event hemm(backend_selector selector, side left_right, - uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event hpr2(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const 
std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}); static inline sycl::event hpr2(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}); static inline sycl::event gbmv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, + float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gbmv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gbmv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t 
incy, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gbmv(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event tbmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tbmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tbmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, 
std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tbmv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event symm(backend_selector selector, side left_right, - uplo upper_lower, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); + uplo upper_lower, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event symm(backend_selector selector, side left_right, - uplo upper_lower, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); + uplo upper_lower, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event symm(backend_selector selector, side left_right, - uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, 
std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event symm(backend_selector selector, side left_right, - uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); static inline sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); static inline sycl::event syr(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - float *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const float* x, std::int64_t incx, + float* a, std::int64_t lda, + const 
std::vector& dependencies = {}); static inline sycl::event syr(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - double *a, std::int64_t lda, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const double* x, std::int64_t incx, + double* a, std::int64_t lda, + const std::vector& dependencies = {}); static inline sycl::event trmm(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event trmm(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + double* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event trmm(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event trmm(backend_selector selector, side left_right, - uplo upper_lower, transpose 
trans, diag unit_diag, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); -static inline sycl::event rotmg(backend_selector selector, float *d1, - float *d2, float *x1, float y1, float *param, - const std::vector &dependencies = {}); +static inline sycl::event rotmg(backend_selector selector, float* d1, float* d2, + float* x1, float y1, float* param, + const std::vector& dependencies = {}); -static inline sycl::event rotmg(backend_selector selector, double *d1, - double *d2, double *x1, double y1, double *param, - const std::vector &dependencies = {}); +static inline sycl::event rotmg(backend_selector selector, double* d1, double* d2, + double* x1, double y1, double* param, + const std::vector& dependencies = {}); static inline sycl::event tpsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const float *a, - float *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const float* a, + float* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tpsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const double *a, - double *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const double* a, + double* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tpsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, - std::int64_t incx, - const std::vector 
&dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tpsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event trsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const float* a, + std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event trsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, const double* a, + std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event trsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event trsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector 
&dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event copy(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}); + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event copy(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}); + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t *n, const float **x, std::int64_t *incx, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t *n, const double **x, std::int64_t *incx, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, - 
std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t n, const float *x, std::int64_t incx, - std::int64_t stridex, float *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t n, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -static inline sycl::event copy_batch(backend_selector selector, - std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* 
group_size, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t n, + const float* x, std::int64_t incx, std::int64_t stridex, + float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t n, + const double* x, std::int64_t incx, std::int64_t stridex, + double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +static inline sycl::event copy_batch(backend_selector selector, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); static inline sycl::event hemv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, 
std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event hemv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, - std::int64_t k, 
std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -static inline sycl::event gemm_bias( - backend_selector selector, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao, - const std::uint8_t *b, int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); - -static inline sycl::event gemm_bias( - backend_selector selector, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao, - const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); - -static inline sycl::event gemm_bias( - backend_selector selector, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const std::uint8_t *a, int64_t lda, - std::uint8_t ao, const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, - int64_t ldc, const std::int32_t *co, const std::vector &dependencies = {}); - -static inline 
sycl::event gemm_bias(backend_selector selector, - transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, - const std::uint8_t *a, int64_t lda, std::uint8_t ao, - const std::uint8_t *b, int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, + std::int8_t ao, const std::uint8_t* b, int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, + std::int8_t ao, const std::int8_t* b, int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, int64_t m, int64_t n, + int64_t k, float alpha, const std::uint8_t* a, int64_t lda, + std::uint8_t ao, const std::int8_t* b, int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); + +static inline sycl::event gemm_bias(backend_selector selector, transpose transa, + transpose transb, offset offsetc, int64_t m, int64_t n, + int64_t k, float alpha, const std::uint8_t* a, int64_t 
lda, + std::uint8_t ao, const std::uint8_t* b, int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); static inline sycl::event sbmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event sbmv(backend_selector selector, uplo upper_lower, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies = {}); static inline sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); static inline sycl::event asum(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}); + const float* x, std::int64_t incx, float* result, + const std::vector& dependencies = {}); static inline sycl::event 
asum(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}); + const double* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); static inline sycl::event tbsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tbsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tbsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); static inline sycl::event tbsv(backend_selector selector, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); 
static inline sycl::event spr2(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *a, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, + const std::vector& dependencies = {}); static inline sycl::event spr2(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, + const std::vector& dependencies = {}); static inline sycl::event iamax(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); + const float* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); static inline sycl::event iamax(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); + const double* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); static inline sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); static inline sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); static inline sycl::event rotm(backend_selector selector, std::int64_t n, - float *x, 
std::int64_t incx, float *y, std::int64_t incy, - float *param, - const std::vector &dependencies = {}); + float* x, std::int64_t incx, float* y, std::int64_t incy, + float* param, const std::vector& dependencies = {}); static inline sycl::event rotm(backend_selector selector, std::int64_t n, - double *x, std::int64_t incx, double *y, std::int64_t incy, - double *param, - const std::vector &dependencies = {}); + double* x, std::int64_t incx, double* y, std::int64_t incy, + double* param, const std::vector& dependencies = {}); -static inline sycl::event rotg(backend_selector selector, float *a, float *b, - float *c, float *s, - const std::vector &dependencies = {}); +static inline sycl::event rotg(backend_selector selector, float* a, float* b, + float* c, float* s, + const std::vector& dependencies = {}); -static inline sycl::event rotg(backend_selector selector, double *a, - double *b, double *c, double *s, - const std::vector &dependencies = {}); +static inline sycl::event rotg(backend_selector selector, double* a, double* b, + double* c, double* s, + const std::vector& dependencies = {}); -static inline sycl::event rotg(backend_selector selector, - std::complex *a, std::complex *b, float *c, - std::complex *s, - const std::vector &dependencies = {}); +static inline sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies = {}); -static inline sycl::event rotg(backend_selector selector, - std::complex *a, std::complex *b, double *c, - std::complex *s, - const std::vector &dependencies = {}); +static inline sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies = {}); static inline sycl::event sdsdot(backend_selector selector, std::int64_t n, - float sb, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *result, - const std::vector &dependencies = {}); + float sb, const 
float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* result, + const std::vector& dependencies = {}); static inline sycl::event her2k(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event her2k(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, double beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event dot(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *result, - const std::vector &dependencies = {}); + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies = {}); static inline sycl::event dot(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *result, - const std::vector &dependencies = {}); + const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* result, + const std::vector& dependencies = {}); static inline sycl::event dot(backend_selector selector, 
std::int64_t n, - const float *x, std::int64_t incx, const float *y, - std::int64_t incy, double *result, - const std::vector &dependencies = {}); + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + double* result, const std::vector& dependencies = {}); static inline sycl::event symv(backend_selector selector, uplo upper_lower, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event symv(backend_selector selector, uplo upper_lower, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, - const std::vector &dependencies = {}); + std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, + const std::vector& dependencies = {}); static inline sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, + float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, + double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t 
batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - float alpha, float *ab, std::int64_t lda, std::int64_t ldb, + float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - double alpha, double *ab, std::int64_t lda, + double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t 
batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, float alpha, const float *a, + std::int64_t n, float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, - float *c, std::int64_t ldc, std::int64_t stride_c, + const float* b, std::int64_t ldb, std::int64_t stride_b, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, double alpha, const double *a, + std::int64_t n, double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, - double *c, std::int64_t ldc, std::int64_t stride_c, + const double* b, std::int64_t ldb, std::int64_t stride_b, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatadd_batch( backend_selector selector, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex *c, std::int64_t 
ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); + std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}); static inline sycl::event omatadd_batch( backend_selector selector, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex *c, std::int64_t ldc, + std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, const std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatcopy(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event omatcopy(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, 
std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event omatcopy2(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, float *b, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatcopy2(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, double *b, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event omatcopy2(backend_selector 
selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event imatcopy(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event imatcopy(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); static inline sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); static inline sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float beta, const float *b, - std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const float* a, std::int64_t lda, 
float beta, const float* b, + std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double beta, const double *b, - std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const double* a, std::int64_t lda, double beta, const double* b, + std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); static inline sycl::event omatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, diff --git a/include/oneapi/mkl/blas/detail/blas_loader.hxx b/include/oneapi/mkl/blas/detail/blas_loader.hxx index 98d93b2ad..22ef22283 100644 --- a/include/oneapi/mkl/blas/detail/blas_loader.hxx +++ b/include/oneapi/mkl/blas/detail/blas_loader.hxx @@ -19,2632 +19,2489 @@ // Buffer APIs -ONEMKL_EXPORT void 
herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer, 1> &a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); -ONEMKL_EXPORT void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); +ONEMKL_EXPORT void herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, double beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, +ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx); 
-ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - float alpha, sycl::buffer, 1> &x, - std::int64_t incx); -ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - double alpha, sycl::buffer, 1> &x, - std::int64_t incx); - -ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); -ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - -ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx); +ONEMKL_EXPORT void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, sycl::buffer, 1>& x, std::int64_t incx); + +ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t 
incx); +ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); -ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); +ONEMKL_EXPORT void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); -ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + +ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); -ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); +ONEMKL_EXPORT void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a); -ONEMKL_EXPORT void 
spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a); +ONEMKL_EXPORT void spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); +ONEMKL_EXPORT void spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, double beta, sycl::buffer& 
c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, - sycl::buffer &c, std::int64_t ldc, + sycl::half alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, + 
std::int64_t ldb, std::int64_t stride_b, sycl::half beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t 
ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, 
std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + float beta, sycl::buffer& c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); +ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + double beta, sycl::buffer& c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size); +ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, 
std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); -ONEMKL_EXPORT void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); +ONEMKL_EXPORT void her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + 
std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, +ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s); -ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, +ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s); -ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, float c, float s); -ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, double c, double s); - -ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, +ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, float c, float s); +ONEMKL_EXPORT void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, 
std::int64_t incx, sycl::buffer& y, + std::int64_t incy, double c, double s); + +ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, +ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy); +ONEMKL_EXPORT void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - 
std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, +ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, double beta, + 
sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); -ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, + sycl::buffer, 1>& y, std::int64_t incy); +ONEMKL_EXPORT void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT void gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); -ONEMKL_EXPORT void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); +ONEMKL_EXPORT void gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); 
-ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, + std::int64_t ldc); +ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, 
std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + sycl::buffer, 1>& c, std::int64_t ldc); + +ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t 
lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + float beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, double beta, - sycl::buffer &y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + double beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, 
std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size); +ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, +ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t 
ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +ONEMKL_EXPORT void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a, +ONEMKL_EXPORT void her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a, +ONEMKL_EXPORT void her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a); -ONEMKL_EXPORT void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a); +ONEMKL_EXPORT void hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a); 
+ONEMKL_EXPORT void hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a); -ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); -ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); +ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); -ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); +ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, uint8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); -ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device 
libkey, sycl::queue &queue, transpose transa, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); +ONEMKL_EXPORT void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, uint8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); - -ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - -ONEMKL_EXPORT void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); + +ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void 
iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + +ONEMKL_EXPORT void hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - -ONEMKL_EXPORT void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, - sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, float y1, - sycl::buffer ¶m); -ONEMKL_EXPORT void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, - sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, double y1, - sycl::buffer ¶m); - -ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - 
sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); -ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - -ONEMKL_EXPORT void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT void spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + +ONEMKL_EXPORT void rotmg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param); +ONEMKL_EXPORT void rotmg(oneapi::mkl::device libkey, sycl::queue& queue, + sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, double y1, sycl::buffer& param); + +ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); +ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); +ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); +ONEMKL_EXPORT void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + 
+ONEMKL_EXPORT void geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); -ONEMKL_EXPORT void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); +ONEMKL_EXPORT void geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); - -ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + 
sycl::buffer& result); +ONEMKL_EXPORT void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); + +ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); -ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); +ONEMKL_EXPORT void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, 
sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc); +ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, 
sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + sycl::buffer, 1>& c, std::int64_t ldc); +ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::half beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + sycl::half alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::half beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - -ONEMKL_EXPORT void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t 
incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); -ONEMKL_EXPORT void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); -ONEMKL_EXPORT void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); + +ONEMKL_EXPORT void syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); +ONEMKL_EXPORT void syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); + +ONEMKL_EXPORT void ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); +ONEMKL_EXPORT void ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); + +ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, 
sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); -ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); +ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); -ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); +ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); -ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); +ONEMKL_EXPORT void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - -ONEMKL_EXPORT void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); -ONEMKL_EXPORT void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - 
-ONEMKL_EXPORT void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + +ONEMKL_EXPORT void dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); +ONEMKL_EXPORT void dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + +ONEMKL_EXPORT void hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +ONEMKL_EXPORT void hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); -ONEMKL_EXPORT void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + 
sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); +ONEMKL_EXPORT void hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); -ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); -ONEMKL_EXPORT void gbmv(oneapi::mkl::device 
libkey, sycl::queue &queue, transpose trans, + sycl::buffer, 1>& y, std::int64_t incy); +ONEMKL_EXPORT void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); +ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); +ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); -ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); +ONEMKL_EXPORT void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - 
sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); +ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); -ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, + std::int64_t ldc); +ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +ONEMKL_EXPORT void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex 
alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -ONEMKL_EXPORT void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); -ONEMKL_EXPORT void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - -ONEMKL_EXPORT void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda); -ONEMKL_EXPORT void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + sycl::buffer, 1>& c, std::int64_t ldc); + +ONEMKL_EXPORT void dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); +ONEMKL_EXPORT void dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + +ONEMKL_EXPORT void syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); +ONEMKL_EXPORT void syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); + +ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose 
trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); -ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); +ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); -ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); +ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); -ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); +ONEMKL_EXPORT void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - -ONEMKL_EXPORT void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, 
std::int64_t incy); - -ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + +ONEMKL_EXPORT void symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + +ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); -ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); +ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); -ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + +ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, 
std::int64_t incx); +ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); +ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); -ONEMKL_EXPORT void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); +ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); -ONEMKL_EXPORT void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - -ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void 
copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); -ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - -ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); +ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); +ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); +ONEMKL_EXPORT void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + +ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t 
n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +ONEMKL_EXPORT void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); 
-ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - -ONEMKL_EXPORT void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); -ONEMKL_EXPORT void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - -ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); -ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); - -ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + 
sycl::buffer& result); +ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + +ONEMKL_EXPORT void sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); +ONEMKL_EXPORT void sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); + +ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); +ONEMKL_EXPORT void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); + +ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, 
sycl::buffer& x, + std::int64_t incx); +ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); -ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); +ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); -ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); +ONEMKL_EXPORT void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - -ONEMKL_EXPORT void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); -ONEMKL_EXPORT void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); - -ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + +ONEMKL_EXPORT void spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a); +ONEMKL_EXPORT void spr2(oneapi::mkl::device libkey, sycl::queue& queue, 
uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a); + +ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); +ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); +ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +ONEMKL_EXPORT void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side 
left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &param); -ONEMKL_EXPORT void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &param); - -ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); -ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); -ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - -ONEMKL_EXPORT void sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - float sb, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - -ONEMKL_EXPORT void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +ONEMKL_EXPORT void rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& param); +ONEMKL_EXPORT void rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& param); + +ONEMKL_EXPORT void 
dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); +ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); +ONEMKL_EXPORT void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); + +ONEMKL_EXPORT void sdsdot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float sb, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result); + +ONEMKL_EXPORT void her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); -ONEMKL_EXPORT void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); +ONEMKL_EXPORT void her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue, - sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); -ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue, - sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); -ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue, - sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, 
- sycl::buffer, 1> &s); -ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue &queue, - sycl::buffer, 1> &a, - sycl::buffer, 1> &b, - sycl::buffer &c, - sycl::buffer, 1> &s); - -ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, + std::int64_t ldb, double beta, sycl::buffer, 1>& c, + std::int64_t ldc); + +ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + sycl::buffer& s); +ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + sycl::buffer& s); +ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue& queue, + sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); +ONEMKL_EXPORT void rotg(oneapi::mkl::device libkey, sycl::queue& queue, + sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); + +ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose 
trans, +ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + 
sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - float beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + float beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t 
m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb); +ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); +ONEMKL_EXPORT void omatcopy(oneapi::mkl::device 
libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); +ONEMKL_EXPORT void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &b, +ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); +ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + 
std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &ab, +ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); +ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +ONEMKL_EXPORT void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue 
&queue, transpose transa, +ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, + sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, + sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); -ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); +ONEMKL_EXPORT void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); // USM APIs -ONEMKL_EXPORT sycl::event 
herk(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float alpha, float *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double alpha, double *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
trmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *a, - const std::vector &dependencies = {}); -ONEMKL_EXPORT 
sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch( - oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue 
&queue, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, sycl::half *alpha, - const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, - float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, - std::int32_t **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, - float beta, float *c, std::int64_t ldc, - std::int64_t stride_c, 
std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, - double beta, double *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, float alpha, + const std::complex* a, std::int64_t lda, float beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const std::complex* a, std::int64_t lda, double beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float alpha, float* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, double* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float alpha, std::complex* x, std::int64_t 
incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const float* a, + std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const double* a, + std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const float* a, + float* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const double* a, + double* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const 
std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, const float* x, std::int64_t incx, + float* a, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const double* x, std::int64_t incx, + double* a, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); 
+ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, sycl::half* alpha, + const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, + const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, + const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, + const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, 
float* beta, + std::int32_t** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, const float* b, + std::int64_t ldb, std::int64_t stride_b, float beta, float* c, + std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, const double* b, + std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event gemm_batch( - oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, + oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event gemm_batch( - oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, + 
oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch( - oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::half alpha, + const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::half* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t 
ldb, std::int64_t stride_b, - float beta, float *c, std::int64_t ldc, std::int64_t stride_c, + const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, + float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, - float beta, float *c, std::int64_t ldc, std::int64_t stride_c, + const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, + float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, - float beta, std::int32_t *c, std::int64_t ldc, + const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, + float beta, std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, 
std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, float *alpha, const float **a, - std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, double *alpha, const double **a, - std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, std::complex *alpha, - const std::complex **a, 
std::int64_t *lda, - std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo *upper_lower, transpose *trans, std::int64_t *n, - std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t 
stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, float c, float s, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, double c, double s, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, float c, float 
s, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, double c, double s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - float *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - double *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, float *alpha, const float **x, - std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, double *alpha, const double **x, - std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, 
std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - const float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double alpha, const double *x, - std::int64_t 
incx, const double beta, double *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, 
std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, 
std::int64_t n, - float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float beta, float *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double beta, double *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv_batch( - oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}); + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, 
const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, float* alpha, const float** a, + std::int64_t* lda, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, double* alpha, const double** a, + std::int64_t* lda, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const 
float* a, std::int64_t lda, + std::int64_t stride_a, float beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, + std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, + uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, + std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hbmv(oneapi::mkl::device 
libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, float c, float s, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, double c, double s, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float* x, std::int64_t incx, float* y, std::int64_t incy, float c, + float s, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double* x, std::int64_t incx, double* y, std::int64_t incy, double c, + double s, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* y, + std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* y, + std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event 
axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, float* alpha, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, double* alpha, const double** x, + std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float alpha, const float* x, std::int64_t incx, + std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, 
sycl::queue& queue, std::int64_t n, + double alpha, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, 
std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + 
std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, + float beta, float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, + double beta, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event gemv_batch( - oneapi::mkl::device libkey, 
sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *trans, std::int64_t *m, std::int64_t *n, - float *alpha, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, - transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); + oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event gemv_batch( - oneapi::mkl::device libkey, sycl::queue 
&queue, transpose *trans, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, std::int64_t m, std::int64_t n, - const float *a, std::int64_t lda, std::int64_t stridea, - const float *x, std::int64_t incx, std::int64_t stridex, - float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, std::int64_t m, std::int64_t n, - const double *a, std::int64_t lda, std::int64_t stridea, - const double *x, std::int64_t incx, std::int64_t stridex, - double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, std::int64_t *m, 
std::int64_t *n, - const float **a, std::int64_t *lda, const float **x, - std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, std::int64_t *m, std::int64_t *n, - const double **a, std::int64_t *lda, const double **x, - std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, - const 
std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const float *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, const float *a, - const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
spmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, const double *a, - const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, float *d1, - float *d2, float *x1, float y1, float *param, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, double *d1, - double *d2, double *x1, double y1, double *param, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t 
lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - float *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - double *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const float *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const double *x, std::int64_t incx, - double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose 
transa, transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, - double *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, - std::int64_t ldb, sycl::half beta, sycl::half *c, - 
std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, - const std::int8_t *a, int64_t lda, std::int8_t ao, - const std::uint8_t *b, int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_bias( - oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, int64_t lda, - std::int8_t ao, const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, - int64_t ldc, const std::int32_t *co, const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, - const std::uint8_t *a, int64_t lda, std::uint8_t ao, - const std::int8_t *b, int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, - transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, - const std::uint8_t *a, int64_t lda, std::uint8_t ao, - const std::uint8_t *b, int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *a, - std::int64_t lda, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *a, - std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *a, - std::int64_t lda, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *a, - std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, 
std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, - std::complex 
alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex *b, - int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, float **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, double **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, - const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - const 
std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, float alpha, const float *a, std::int64_t lda, - const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, 
- std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, - transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, 
sycl::queue &queue, - side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *a, std::int64_t lda, - const 
std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, - side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *x, 
std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag 
unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const float *x, std::int64_t incx, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const double *x, std::int64_t incx, double *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, const float **x, std::int64_t *incx, - float **y, std::int64_t *incy, int64_t group_count, - int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, const double **x, std::int64_t *incx, - double **y, std::int64_t *incy, int64_t group_count, - int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, - std::int64_t *incy, int64_t group_count, - int64_t *group_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, - std::int64_t *incy, int64_t 
group_count, - int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const float *x, std::int64_t incx, - std::int64_t stridex, float *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const 
float *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - float *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const std::complex *x, std::int64_t incx, - double *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const float *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, 
const double *x, std::int64_t incx, - double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *a, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, float *param, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event rotm(oneapi::mkl::device 
libkey, sycl::queue &queue, - std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, double *param, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *result, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float sb, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, float beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, - uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, float *a, - float *b, float *c, float *s, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
rotg(oneapi::mkl::device libkey, sycl::queue &queue, double *a, - double *b, double *c, double *s, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, - std::complex *a, std::complex *b, float *c, - std::complex *s, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, - std::complex *a, std::complex *b, double *c, - std::complex *s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, + oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* trans, std::int64_t* m, std::int64_t* n, + float* alpha, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float* beta, float** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double* beta, double** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** x, + std::int64_t* 
incx, std::complex* beta, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, + transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** x, + std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, std::int64_t m, std::int64_t n, + const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, + float* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, std::int64_t m, std::int64_t n, + const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, + double* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, + 
std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, std::int64_t* m, std::int64_t* n, + const float** a, std::int64_t* lda, const float** x, + std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, std::int64_t* m, std::int64_t* n, + const double** a, std::int64_t* lda, const double** x, + std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, const std::complex* x, + std::int64_t incx, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const std::complex* x, + std::int64_t incx, std::complex* a, std::int64_t lda, 
+ const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, const std::complex* x, + std::int64_t incx, std::complex* a, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const std::complex* x, + std::int64_t incx, std::complex* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, + std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spmv(oneapi::mkl::device libkey, sycl::queue& 
queue, uplo upper_lower, + std::int64_t n, float alpha, const float* a, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const double* a, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue& queue, float* d1, + float* d2, float* x1, float y1, float* param, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue& queue, double* d1, + double* d2, double* x1, double y1, double* param, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event 
geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, float* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, 
+ std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose transa, transpose transb, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, 
std::int64_t m, std::int64_t n, std::int64_t k, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, sycl::half beta, + sycl::half* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const bfloat16* a, std::int64_t lda, const bfloat16* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, + transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, const std::int8_t* a, + int64_t lda, std::int8_t ao, const std::uint8_t* b, int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, + transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, const std::int8_t* a, + int64_t lda, std::int8_t ao, const std::int8_t* b, int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, + transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, const std::uint8_t* a, + int64_t lda, std::uint8_t ao, const std::int8_t* b, 
int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, + transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, const std::uint8_t* a, + int64_t lda, std::uint8_t ao, const std::uint8_t* b, + int64_t ldb, std::uint8_t bo, float beta, std::int32_t* c, + int64_t ldc, const std::int32_t* co, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, 
sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + double* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, double* b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, std::complex* b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event 
trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, std::complex* b, + int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, float* alpha, + const float** a, int64_t* lda, float** b, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, double* alpha, + const double** a, int64_t* lda, double** b, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex** b, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, + side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex** b, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& 
dependencies = {}); +ONEMKL_EXPORT sycl::event dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, + float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, 
std::int64_t n, std::int64_t kl, std::int64_t ku, + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, 
std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr(oneapi::mkl::device libkey, 
sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, const float* x, std::int64_t incx, + float* a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const double* x, std::int64_t incx, + double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + double* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event 
symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const float* a, + float* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const double* a, + double* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const float* a, + std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, const double* a, + std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const 
std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, const float** x, std::int64_t* incx, + float** y, std::int64_t* incy, int64_t group_count, + int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, const double** x, std::int64_t* incx, + double** y, std::int64_t* incy, int64_t group_count, + int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event 
copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, std::int64_t stridex, + float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, std::int64_t stridex, + double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + 
std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t 
incx, float* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, std::int64_t n, std::int64_t k, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + std::int64_t n, double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float* x, 
std::int64_t incx, float* y, std::int64_t incy, + float* param, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double* x, std::int64_t incx, double* y, std::int64_t incy, + double* param, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* result, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + double* result, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float sb, const float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, 
sycl::queue& queue, float* a, float* b, + float* c, float* s, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, double* a, double* b, + double* c, double* s, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, + std::complex* a, std::complex* b, float* c, + std::complex* s, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, + std::complex* a, std::complex* b, double* c, + std::complex* s, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, + float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, + double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const 
std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, float *ab, std::int64_t lda, std::int64_t ldb, + float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, double *ab, std::int64_t lda, + double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t 
n, - std::complex alpha, std::complex *ab, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, float alpha, const float *a, + std::int64_t n, float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, - float *c, std::int64_t ldc, std::int64_t stride_c, + const float* b, std::int64_t ldb, std::int64_t stride_b, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, double alpha, const double *a, + std::int64_t n, double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, - double *c, std::int64_t ldc, std::int64_t stride_c, + const double* b, std::int64_t ldb, std::int64_t stride_b, + double* c, 
std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event omatadd_batch( - oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex *c, std::int64_t ldc, + oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, const std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event omatadd_batch( - oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, std::complex *c, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, 
sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, float *b, +ONEMKL_EXPORT sycl::event 
omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, double *b, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = 
{}); -ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, +ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, 
std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float beta, const float *b, - std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + const float* a, std::int64_t lda, float beta, const float* b, + std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double beta, const double *b, - std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + const double* a, std::int64_t lda, double beta, const double* b, + std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, 
std::int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, std::int64_t* m, std::int64_t* n, diff --git a/include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx b/include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx index 9483a66c1..d5678917e 100644 --- a/include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx +++ b/include/oneapi/mkl/blas/detail/cublas/blas_ct.hxx @@ -18,122 +18,121 @@ **************************************************************************/ void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, + std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, 
std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> 
&a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + 
diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -142,10 +141,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - double beta, sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer& a, std::int64_t lda, std::int64_t 
stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); @@ -153,10 +152,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -165,11 +163,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, 
beta, c, ldc, stride_c, batch_size); @@ -177,9 +174,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -188,9 +185,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -199,9 +196,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t 
stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -210,9 +207,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -220,38 +217,38 @@ void gemm_batch(backend_selector selector, transpose transa, tr } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, 
std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -259,10 +256,9 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size) { + std::int64_t n, 
std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); @@ -270,9 +266,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -281,249 +276,240 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, 
std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { + sycl::buffer, 1>& x, 
std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, double s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, double c, double s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void axpy(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - 
sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy_batch(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, 
stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpby(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { 
oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void sdsdot(backend_selector selector, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &result) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, 
transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { 
oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, std::int64_t incy, + 
std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, @@ -531,22 +517,21 @@ void gemv_batch(backend_selector selector, transpose trans, std } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, 
batch_size); @@ -554,52 +539,49 @@ void gemv_batch(backend_selector selector, transpose trans, std void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, 
std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size) { + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -607,277 +589,268 @@ void dgmm_batch(backend_selector selector, side left_right, std } void her(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void 
her(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - 
sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void 
gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, 
offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::cublas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void swap(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } -void swap(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, 
std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } 
void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + 
sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, 
sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a, - std::int64_t lda) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -885,7 +858,7 @@ void 
trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -893,190 +866,187 @@ void trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - 
sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) 
{ + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, 
std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag 
unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex 
alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); @@ -1084,7 +1054,7 @@ void syr(backend_selector selector, uplo upper_lower, std::int6 void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, 
std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1092,7 +1062,7 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1100,335 +1070,328 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer &param) { +void 
rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer &param) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), 
upper_lower, trans, unit_diag, n, a, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void copy(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { 
oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } -void copy(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, 
std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void 
gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t 
lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, 
- std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag 
unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::cublas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - 
sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void rotm(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void rotm(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, 
std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1437,8 +1400,8 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1447,10 +1410,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, 
std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1458,10 +1420,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1469,72 +1430,69 @@ void trsm_batch(backend_selector selector, side left_right, upl void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { 
oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - 
sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::cublas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1542,8 +1500,8 @@ void omatcopy_batch(backend_selector selector, transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1551,22 +1509,22 @@ void omatcopy_batch(backend_selector selector, 
transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1574,7 +1532,7 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1582,16 +1540,16 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, 
std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1599,9 +1557,9 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1610,9 +1568,9 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, 
std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1621,10 +1579,10 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::cublas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1632,123 +1590,123 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, double alpha, 
sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector 
selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::cublas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, 
std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, 
std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } @@ -1756,253 +1714,250 @@ void omatadd(backend_selector selector, transpose transa, trans // USM APIs sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(backend_selector selector, std::int64_t n, float alpha, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, 
dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - double *x, std::int64_t incx, - const std::vector &dependencies) { + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector 
&dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const 
float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - const std::vector &dependencies) { + float alpha, const float* x, 
std::int64_t incx, float* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, 
std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - 
std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); @@ -2010,11 +1965,10 @@ sycl::event syrk_batch(backend_selector selector, uplo *upper_l } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, 
std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2022,11 +1976,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2034,12 +1987,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2047,12 +1999,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2060,28 +2011,28 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector 
&dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -2089,126 +2040,124 @@ sycl::event hbmv(backend_selector selector, uplo upper_lower, s } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector 
selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { + const float* x, std::int64_t incx, float* y, std::int64_t 
incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - float *alpha, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - double *alpha, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, double* alpha, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, 
std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } sycl::event axpy_batch(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const float* x, std::int64_t incx, std::int64_t stridex, float* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2216,9 +2165,9 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const double* x, std::int64_t incx, std::int64_t stridex, double* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2226,10 +2175,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, 
std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2237,10 +2186,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2248,61 +2197,61 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpby(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, 
std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2310,9 +2259,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2320,10 +2269,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t 
lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2331,11 +2280,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2343,231 +2291,222 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const 
std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float beta, float *y, - std::int64_t incy, std::int64_t stridey, 
std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double beta, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, 
transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** 
a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, float* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector 
selector, side left_right, - std::int64_t m, std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, double* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, 
std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); @@ -2575,164 +2514,162 @@ sycl::event dgmm_batch(backend_selector selector, side *left_ri } sycl::event her(backend_selector selector, uplo upper_lower, 
std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, 
const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* 
lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + 
std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - sycl::half *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, 
+ std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); @@ -2740,12 +2677,11 @@ sycl::event gemm_batch(backend_selector selector, transpose *tr } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2753,12 +2689,11 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2766,13 +2701,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + 
std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2780,13 +2714,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2794,12 +2727,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::half alpha, const 
sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2808,10 +2741,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2820,10 +2753,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const 
std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2832,10 +2765,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2843,109 +2776,105 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event 
spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - 
std::int64_t incy, const std::vector &dependencies) { + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* 
x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2953,9 +2882,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, 
double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2963,10 +2892,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2974,10 +2903,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), 
transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2985,10 +2914,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2996,10 +2925,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3007,9 +2935,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector 
&dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3017,11 +2945,11 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3029,11 +2957,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t 
ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3041,11 +2969,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3053,11 +2981,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, 
std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3065,45 +2993,44 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, + std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3111,9 +3038,9 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3121,10 +3048,10 @@ sycl::event 
trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3132,10 +3059,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3143,11 +3070,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t 
lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3155,11 +3081,10 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3167,11 +3092,11 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { 
auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3179,62 +3104,57 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, 
group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t 
*n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); @@ -3242,28 +3162,28 @@ sycl::event trsm_batch(backend_selector selector, side *left_ri } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const 
std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3271,10 +3191,10 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3282,27 +3202,27 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event hpr2(backend_selector selector, uplo 
upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3310,10 +3230,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3321,10 +3240,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } 
sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3332,10 +3251,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3343,45 +3262,43 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, const float *a, 
std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3389,9 +3306,9 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3399,10 +3316,10 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ 
-3410,10 +3327,10 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3421,43 +3338,43 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, std::int64_t 
lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3465,9 +3382,9 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, 
unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3475,10 +3392,10 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3486,267 +3403,257 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -sycl::event rotmg(backend_selector selector, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } -sycl::event rotmg(backend_selector selector, 
double *d1, double *d2, - double *x1, double y1, double *param, - const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, double* d1, double* d2, double* x1, + double y1, double* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { 
+ diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, 
std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::cublas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const float **x, std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event 
copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const 
std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, 
std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector 
selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3754,9 +3661,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, float 
alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3764,9 +3671,9 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3774,182 +3681,176 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event 
asum(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector 
&dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, 
std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, 
incy, param, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotg(backend_selector selector, float *a, float *b, float *c, - float *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, double *a, double *b, double *c, - double *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::cublas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } sycl::event sdsdot(backend_selector selector, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *result, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result, dependencies); return done; } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3957,64 +3858,63 @@ sycl::event her2k(backend_selector selector, uplo upper_lower, } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, 
dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::symv( 
selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4022,10 +3922,10 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, 
stride_b, batch_size, dependencies); @@ -4034,9 +3934,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4045,9 +3945,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4055,18 +3955,18 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto 
done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4074,9 +3974,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4084,9 +3984,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4094,10 +3994,10 @@ sycl::event imatcopy_batch(backend_selector selector, 
transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4106,10 +4006,10 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4118,11 +4018,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const 
std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4131,11 +4031,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4143,115 +4043,115 @@ sycl::event omatadd_batch(backend_selector selector, transpose } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t 
lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy2( selector.get_queue(), 
trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, 
float alpha, const float *a, std::int64_t lda, - float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float beta, const float* b, std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4259,9 +4159,9 @@ sycl::event omatadd(backend_selector selector, transpose transa } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + double beta, const double* b, std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4270,9 +4170,9 @@ sycl::event omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4281,9 +4181,9 @@ sycl::event 
omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4316,8 +4216,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4328,8 +4227,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4339,8 +4237,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, float* alpha, float** ab, std::int64_t* lda, 
std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); @@ -4350,8 +4247,7 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, double* alpha, double** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::cublas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); diff --git a/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx b/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx index 1141eb238..caa75a646 100644 --- a/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx +++ b/include/oneapi/mkl/blas/detail/cublas/onemkl_blas_cublas.hxx @@ -19,2314 +19,2211 @@ // Buffer APIs -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer 
&result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void axpy(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -void axpy(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer &y, +void axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size); - -void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, 
+void axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -void axpby(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy); +void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -void axpby(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); +void axpby(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy); -void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); +void axpby(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy); +void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, 
std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size); -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, +void copy_batch(sycl::queue& queue, 
std::int64_t n, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void copy_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); + +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result); -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result); -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result); -void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, 
std::int64_t incy, - sycl::buffer, 1> &result); +void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void iamax(sycl::queue &queue, std::int64_t n, 
sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, float c, +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, float c, float s); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - double c, double s); +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 
1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, double c, + double s); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, float c, float s); +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, float c, float s); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, double c, double s); +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, double c, double s); -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &param); +void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& param); -void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &param); +void rotm(sycl::queue& queue, std::int64_t n, 
sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& param); -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, float y1, sycl::buffer &param); +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param); -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, double y1, sycl::buffer &param); +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, double y1, sycl::buffer& param); -void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, +void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx); -void scal(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, +void scal(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx); -void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); - -void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); - -void scal(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx); - -void scal(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx); +void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); -void sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); +void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer, 1>& x, + std::int64_t incx); -void swap(sycl::queue &queue, std::int64_t 
n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void scal(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx); -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy); +void sdsdot(sycl::queue& queue, std::int64_t n, float sb, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - 
std::int64_t ku, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy); -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy); - -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); - -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, float beta, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex 
alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); + +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); + +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); + +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); + +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, float beta, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t 
batch_size); +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, double beta, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &x, +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size); - -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size); - -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, 
std::int64_t stridea, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size); - -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size); - -void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda); - -void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda); - -void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - 
sycl::buffer, 1> &a, std::int64_t lda); - -void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy); - -void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy); +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size); -void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda); +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size); -void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda); +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + 
sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size); -void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size); -void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); +void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); -void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); + +void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, 
std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void her(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); + +void her(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); + +void her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + 
sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +void hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a); +void hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); -void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a); +void hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); -void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); +void hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); -void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); +void hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, 
std::int64_t incy, + sycl::buffer, 1>& a); -void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy); +void sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); -void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); +void sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); +void spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); -void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); +void spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); -void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a); +void spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a); -void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double 
alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a); +void spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a); -void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a); +void spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a); -void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a); +void spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a); -void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy); +void symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); -void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); +void symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, - std::int64_t lda); +void syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t 
lda); -void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, +void syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda); -void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda); +void syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); -void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda); +void syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, 
std::int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void tbsv(sycl::queue& queue, 
uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, 
sycl::buffer& x, std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t 
lda, + sycl::buffer, 1>& x, std::int64_t incx); -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - 
sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::half alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, sycl::half beta, - sycl::buffer &c, std::int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float 
beta, sycl::buffer& c, std::int64_t ldc); -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer, 1> &a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - float beta, sycl::buffer, 1> &c, std::int64_t ldc); - -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - double beta, sycl::buffer, 1> &c, std::int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, +void gemm(sycl::queue& queue, transpose 
transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::half alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::half beta, + sycl::buffer& c, std::int64_t ldc); -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::complex beta, sycl::buffer, 1> &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, 
std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); - -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer, 1>& a, std::int64_t lda, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + 
double alpha, sycl::buffer, 1>& a, std::int64_t lda, double beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer& a, std::int64_t lda, double beta, + 
sycl::buffer& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); + +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 
1> &c, std::int64_t ldc); - -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc); +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size); -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb); - -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb); - -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb); - -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb); - -void trsm(sycl::queue &queue, side 
left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size); +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); + +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); + +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, 
std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb); + +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb); + +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + +void 
trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, double beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> 
&c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size); +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::half alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::half beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void gemm_batch(sycl::queue& 
queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side 
left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + 
sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size); + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc); + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, 
std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co); -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size); +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, 
int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size); +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void 
imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + float beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, 
std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t 
n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb); -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb); -void 
imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb); -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb); -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, float beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, float beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, double beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, double beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose 
transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc); // USM APIs -sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, float *result, - const std::vector &dependencies = {}); - -sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}); - -sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - float *result, const std::vector &dependencies = {}); - -sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - double *result, const std::vector &dependencies = {}); - -sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const 
std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, - std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, - std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, const double *x, - std::int64_t incx, std::int64_t stridex, double *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex 
*y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event axpby(sycl::queue &queue, std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const float **x, - std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x, - std::int64_t *incx, double **y, std::int64_t 
*incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t stridex, double *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *result, - const std::vector &dependencies = {}); - -sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *result, - const std::vector &dependencies = {}); - -sycl::event dot(sycl::queue &queue, std::int64_t n, 
const float *x, std::int64_t incx, - const float *y, std::int64_t incy, double *result, - const std::vector &dependencies = {}); - -sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); + +sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +sycl::event asum(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + float* result, const std::vector& dependencies = {}); + +sycl::event asum(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + double* result, const std::vector& dependencies = {}); + +sycl::event axpy(sycl::queue& queue, std::int64_t n, float alpha, const float* x, std::int64_t incx, + float* y, std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event axpy(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + 
const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, float* alpha, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, double* alpha, const double** x, + std::int64_t* incx, double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* 
y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event axpby(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event axpby(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event copy(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, float* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event copy(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, double* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const float** x, std::int64_t* 
incx, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const double** x, std::int64_t* incx, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, 
std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, float *result, - const std::vector &dependencies = {}); +sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); 
-sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}); +sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - float *result, const std::vector &dependencies = {}); +sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - double *result, const std::vector &dependencies = {}); +sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, float c, float s, - const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, double c, - double s, const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, float c, float s, - const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + 
std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, double c, double s, - const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, - float *c, std::complex *s, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, - double *c, std::complex *s, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); -sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, float *param, - const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); -sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, - double *y, 
std::int64_t incy, double *param, - const std::vector &dependencies = {}); - -sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, - float *param, const std::vector &dependencies = {}); - -sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1, - double *param, const std::vector &dependencies = {}); - -sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, double *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies = {}); - -sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, - double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, - 
std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, 
std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stridea, - const float *x, std::int64_t incx, std::int64_t stridex, float beta, - float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stridea, - const double *x, std::int64_t incx, std::int64_t stridex, double beta, - double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event 
gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float *beta, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double *beta, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose *trans, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const float *a, std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const double *a, std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t 
batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const float **a, std::int64_t *lda, const float **x, - std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const double **a, std::int64_t *lda, const double **x, - std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, std::int64_t *m, - std::int64_t *n, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector 
&dependencies = {}); - -sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double *y, std::int64_t incy, - double *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, 
std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, 
std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - const float *a, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const double *a, const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event spr(sycl::queue &queue, uplo 
upper_lower, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *a, - const std::vector &dependencies = {}); - -sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *a, - const std::vector &dependencies = {}); - -sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, - const std::vector &dependencies = {}); - -sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double *y, std::int64_t incy, - double *a, const std::vector &dependencies = {}); - -sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - const float *a, std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double *y, std::int64_t incy, - 
double *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event 
tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const 
std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue 
&queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, sycl::half beta, - sycl::half *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - 
std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const std::complex *a, std::int64_t lda, - float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const std::complex *a, std::int64_t lda, - double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, float beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, 
const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta, - double *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, - std::int64_t *n, std::int64_t *k, float *alpha, const float **a, - std::int64_t *lda, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, - std::int64_t *n, std::int64_t *k, double *alpha, const double **a, - std::int64_t *lda, double *beta, double **c, std::int64_t *ldc, - 
std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, - std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, - std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float beta, float *c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double beta, double *c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, 
std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, 
std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies = {}); - -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies = {}); - -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies = {}); - -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, 
std::int64_t stride_a, - double *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, std::int64_t *m, std::int64_t *n, - float *alpha, const float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag 
*unit_diag, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, const float **b, std::int64_t *ldb, - float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, const double **b, std::int64_t *ldb, - double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, sycl::half *alpha, - const sycl::half **a, std::int64_t *lda, const 
sycl::half **b, - std::int64_t *ldb, sycl::half *beta, sycl::half **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, const sycl::half **a, - std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, float *beta, - float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, - float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, - std::int32_t **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, const float *b, - std::int64_t ldb, std::int64_t stride_b, float beta, float *c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double 
*c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, - std::int64_t ldb, std::int64_t stride_b, float beta, float *c, +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + float* result, const std::vector& dependencies = {}); + 
+sycl::event nrm2(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + double* result, const std::vector& dependencies = {}); + +sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, float c, float s, + const std::vector& dependencies = {}); + +sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, double c, double s, + const std::vector& dependencies = {}); + +sycl::event rot(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + std::int64_t incy, float c, float s, + const std::vector& dependencies = {}); + +sycl::event rot(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, + std::int64_t incy, double c, double s, + const std::vector& dependencies = {}); + +sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies = {}); + +sycl::event rotg(sycl::queue& queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies = {}); + +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, float* c, + std::complex* s, const std::vector& dependencies = {}); + +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, double* c, + std::complex* s, const std::vector& dependencies = {}); + +sycl::event rotm(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + std::int64_t incy, float* param, + const std::vector& dependencies = {}); + +sycl::event rotm(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, + std::int64_t incy, double* param, + const std::vector& dependencies = {}); + +sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, float y1, float* param, + const std::vector& dependencies = {}); + +sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, double y1, double* param, + 
const std::vector& dependencies = {}); + +sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event sdsdot(sycl::queue& queue, std::int64_t n, float sb, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies = {}); + +sycl::event swap(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event swap(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, const float* a, std::int64_t lda, + const float* x, std::int64_t incx, float beta, float* 
y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, const double* a, std::int64_t lda, + const double* x, std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* 
x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, float beta, + float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, double beta, + double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex beta, std::complex* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex beta, std::complex* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, std::int64_t* n, + float* alpha, const float** a, std::int64_t* lda, const float** x, + std::int64_t* incx, float* beta, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, 
transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, const double** x, + std::int64_t* incx, double* beta, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const float* a, std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const double* a, std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const 
std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, std::int64_t* n, + const float** a, std::int64_t* lda, const float** x, std::int64_t* incx, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, std::int64_t* n, + const double** a, std::int64_t* lda, const double** x, std::int64_t* incx, + double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, 
std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event 
hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event her(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + std::int64_t lda, const std::vector& dependencies = {}); + +sycl::event her(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + std::int64_t lda, const std::vector& dependencies = {}); + +sycl::event her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const 
std::complex* x, std::int64_t incx, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, const float* a, + const float* x, std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* a, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, const std::vector& dependencies = {}); + +sycl::event spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, const std::vector& dependencies = {}); + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, const float* x, + 
std::int64_t incx, const float* y, std::int64_t incy, float* a, + const std::vector& dependencies = {}); + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double* y, std::int64_t incy, double* a, + const std::vector& dependencies = {}); + +sycl::event symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* a, std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double* y, std::int64_t incy, double* a, + std::int64_t lda, const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, 
std::int64_t k, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo 
upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + 
std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t 
k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, sycl::half beta, + sycl::half* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, std::int64_t lda, + const bfloat16* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const std::complex* a, std::int64_t lda, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + 
+sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const std::complex* a, std::int64_t lda, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, float beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, double beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const 
std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, float beta, + float* c, std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, double beta, + double* c, std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, float* alpha, const float** a, std::int64_t* lda, + float* beta, float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, double* alpha, const double** a, std::int64_t* lda, + double* beta, double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); 
+ +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, std::int64_t* n, + std::int64_t* k, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double beta, double* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo 
upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +sycl::event trsm(sycl::queue& queue, side left_right, 
uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, double* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t 
stride_b, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, std::int64_t* m, std::int64_t* n, float* alpha, + const float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, std::int64_t* m, std::int64_t* n, double* alpha, + const double** a, std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, const float** a, + std::int64_t* lda, 
const float** b, std::int64_t* ldb, float* beta, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, double* alpha, const double** a, + std::int64_t* lda, const double** b, std::int64_t* ldb, double* beta, + double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, sycl::half* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, sycl::half* beta, + sycl::half** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, float* beta, + float** c, 
std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const std::int8_t** b, std::int64_t* ldb, float* beta, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const std::int8_t** b, std::int64_t* ldb, float* beta, + std::int32_t** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, const float* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, const double* b, std::int64_t ldb, + std::int64_t stride_b, double beta, double* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t 
ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, + std::int64_t ldb, std::int64_t stride_b, sycl::half beta, sycl::half* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, + std::int64_t ldb, std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, - std::int64_t ldb, std::int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, + std::int64_t ldb, std::int64_t stride_b, float beta, float* c, std::int64_t ldc, 
std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, - std::int64_t ldb, std::int64_t stride_b, float beta, std::int32_t *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, + std::int64_t ldb, std::int64_t stride_b, float beta, std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, 
std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int8_t ao, - const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int8_t ao, - const std::int8_t *b, std::int64_t ldb, std::int8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, - const std::int8_t *b, std::int64_t ldb, std::int8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, - const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, float beta, - std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, 
std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::uint8_t* b, + std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t* c, + std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::int8_t* b, + std::int64_t ldb, std::int8_t bo, float beta, std::int32_t* c, + std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose 
transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, + const std::int8_t* b, std::int64_t ldb, std::int8_t bo, float beta, + std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, + const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, float beta, + std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const 
std::vector& dependencies = {}); -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, 
+sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stride_a, + float beta, const float* b, int64_t ldb, int64_t stride_b, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stride_a, + double beta, const double* b, int64_t ldb, int64_t stride_b, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector 
&dependencies = {}); + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, float *b, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, double *b, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - 
-sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, float beta, const float *b, - int64_t ldb, float *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, double beta, const double *b, - int64_t ldb, double *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, 
int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, float* b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, double* b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, std::int64_t stridea, float* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, std::int64_t stridea, double* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + 
std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, float beta, const float* b, + int64_t ldb, float* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, double beta, const double* b, + int64_t ldb, double* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, 
int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb, diff --git a/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx b/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx index 1724bf5c7..38123485e 100644 --- a/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx +++ b/include/oneapi/mkl/blas/detail/mklcpu/blas_ct.hxx @@ -20,122 +20,121 @@ // Buffer APIs void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, + std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, 
alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, 
std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, 
std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -144,10 +143,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - double beta, sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, 
+ sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); @@ -155,10 +154,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -167,11 +165,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); @@ -179,9 +176,9 @@ void 
gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -190,9 +187,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -201,9 +198,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t 
stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -212,9 +209,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -222,38 +219,38 @@ void gemm_batch(backend_selector selector, transpose transa, tr } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, std::int64_t ldc) { 
oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -261,10 +258,9 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size) { + std::int64_t n, std::int64_t k, double alpha, 
sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); @@ -272,9 +268,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -283,249 +278,240 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& 
y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, 
std::int64_t incy, float c, float s) { oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, double s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, double c, double s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void axpy(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + 
sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy_batch(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void 
axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpby(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); 
} void sdsdot(backend_selector selector, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &result) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, 
sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector 
selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, std::int64_t incy, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t 
stridex, float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, @@ -533,22 +519,21 @@ void gemv_batch(backend_selector selector, transpose trans, std } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); @@ -556,52 +541,49 @@ void gemv_batch(backend_selector selector, transpose trans, std void gemv_batch(backend_selector selector, 
transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, 
stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size) { + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -609,277 +591,268 @@ void dgmm_batch(backend_selector selector, side left_right, std } void her(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void her(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t 
lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, 
x, incx, result); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, 
std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, 
float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void swap(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } -void swap(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& 
y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer 
&b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { 
oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, 
sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a, - std::int64_t lda) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -887,7 +860,7 @@ void trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag 
unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -895,190 +868,187 @@ void trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + 
sycl::buffer, 1>& result) { oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { 
oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, 
sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { 
oneapi::mkl::blas::mklcpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> 
&c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); @@ -1086,7 +1056,7 @@ void syr(backend_selector selector, uplo upper_lower, std::int6 void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, 
std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1094,7 +1064,7 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1102,335 +1072,328 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { 
oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - 
diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void copy(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } -void copy(backend_selector selector, std::int64_t n, - sycl::buffer &x, 
std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t 
batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, 
std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { 
oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, 
double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, 
std::int64_t incx) { oneapi::mkl::blas::mklcpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, 
+ sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void rotm(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void rotm(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) 
{ oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1439,8 +1402,8 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1449,10 +1412,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, 
unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1460,10 +1422,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1471,72 +1432,69 @@ void trsm_batch(backend_selector selector, side left_right, upl void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void 
rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double 
beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklcpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1544,8 +1502,8 @@ void omatcopy_batch(backend_selector selector, transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1553,22 +1511,22 @@ void omatcopy_batch(backend_selector selector, transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, 
- sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1576,7 +1534,7 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1584,16 +1542,16 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t 
lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1601,9 +1559,9 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1612,9 +1570,9 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t 
ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1623,10 +1581,10 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1634,123 +1592,123 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, 
n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, 
sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, 
std::int64_t ldb) { oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> 
&c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } @@ -1758,253 +1716,250 @@ void omatadd(backend_selector selector, transpose transa, trans // USM APIs sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(backend_selector selector, std::int64_t n, float alpha, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - double *x, std::int64_t incx, - const std::vector 
&dependencies) { + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, 
incx, a, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, 
std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, 
dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { 
+sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); @@ -2012,11 +1967,10 @@ sycl::event syrk_batch(backend_selector selector, uplo *upper_l } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t 
batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2024,11 +1978,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2036,12 +1989,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2049,12 
+2001,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2062,28 +2013,28 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -2091,126 +2042,124 @@ sycl::event hbmv(backend_selector selector, uplo upper_lower, s } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return 
done; } sycl::event axpy(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - float *alpha, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); 
return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - double *alpha, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, double* alpha, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, 
incy, group_count, group_size, dependencies); return done; } sycl::event axpy_batch(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const float* x, std::int64_t incx, std::int64_t stridex, float* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2218,9 +2167,9 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const double* x, std::int64_t incx, std::int64_t stridex, double* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2228,10 +2177,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, 
incy, stridey, batch_size, dependencies); @@ -2239,10 +2188,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2250,61 +2199,61 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpby(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + 
std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event 
syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2312,9 +2261,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2322,10 +2271,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2333,11 +2282,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2345,231 +2293,222 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, 
std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float beta, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + 
std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double beta, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const 
std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + 
std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, 
dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, float* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t 
incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, double* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, 
std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, 
group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); @@ -2577,164 +2516,162 @@ sycl::event dgmm_batch(backend_selector selector, side *left_ri } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const 
std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), 
n, x, incx, result, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - sycl::half *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, 
alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); @@ -2742,12 +2679,11 @@ sycl::event gemm_batch(backend_selector selector, transpose *tr } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, 
stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2755,12 +2691,11 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2768,13 +2703,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2782,13 +2716,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2796,12 +2729,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, 
sycl::half beta, sycl::half* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2810,10 +2743,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2822,10 +2755,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, 
std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2834,10 +2767,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2845,109 +2778,105 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, std::int64_t incx, 
double beta, - double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2955,9 +2884,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + 
std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2965,10 +2894,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2976,10 +2905,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ 
-2987,10 +2916,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2998,10 +2927,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3009,9 +2937,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, 
const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3019,11 +2947,11 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3031,11 +2959,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, 
std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3043,11 +2971,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3055,11 +2983,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, 
std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3067,45 +2995,44 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, + std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, 
incy, a, lda, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3113,9 +3040,9 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3123,10 +3050,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector 
selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3134,10 +3061,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3145,11 +3072,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + 
std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3157,11 +3083,10 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3169,11 +3094,11 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), 
left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3181,62 +3106,57 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector 
selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex 
**b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); @@ -3244,28 +3164,28 @@ sycl::event trsm_batch(backend_selector selector, side *left_ri } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, 
const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3273,10 +3193,10 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3284,27 +3204,27 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const 
std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3312,10 +3232,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3323,10 +3242,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, 
std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3334,10 +3253,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3345,45 +3264,43 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv( selector.get_queue(), 
upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t 
ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3391,9 +3308,9 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3401,10 +3318,10 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3412,10 +3329,10 @@ sycl::event symm(backend_selector selector, side left_right, up } 
sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3423,43 +3340,43 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t 
incx, float* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3467,9 +3384,9 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3477,10 +3394,10 @@ sycl::event 
trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3488,267 +3405,257 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -sycl::event rotmg(backend_selector selector, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } -sycl::event rotmg(backend_selector selector, double *d1, double *d2, - double *x1, double y1, double *param, - const std::vector 
&dependencies) { +sycl::event rotmg(backend_selector selector, double* d1, double* d2, double* x1, + double y1, double* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, 
std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, 
std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, 
dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const float **x, std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, 
std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, 
std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, 
const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, 
std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3756,9 +3663,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float 
beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3766,9 +3673,9 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3776,182 +3683,176 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* 
result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + 
std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event 
iamax(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event 
rotm(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotg(backend_selector selector, float *a, float *b, float *c, - float *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, double *a, double *b, double *c, - double *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return 
done; } sycl::event sdsdot(backend_selector selector, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *result, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result, dependencies); return done; } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3959,64 +3860,63 @@ sycl::event her2k(backend_selector selector, uplo upper_lower, } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const 
float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return 
done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4024,10 +3924,10 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4036,9 +3936,9 @@ sycl::event 
omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4047,9 +3947,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4057,18 +3957,18 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch( 
selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4076,9 +3976,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4086,9 +3986,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4096,10 +3996,10 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, 
transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4108,10 +4008,10 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4120,11 +4020,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - 
std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4133,11 +4033,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4145,115 +4045,115 @@ sycl::event omatadd_batch(backend_selector selector, transpose } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); 
return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { 
auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event 
imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - float beta, const float *b, std::int64_t ldb, 
float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float beta, const float* b, std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4261,9 +4161,9 @@ sycl::event omatadd(backend_selector selector, transpose transa } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + double beta, const double* b, std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4272,9 +4172,9 @@ sycl::event omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4283,9 +4183,9 @@ sycl::event omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose 
transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4318,8 +4218,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4330,8 +4229,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4341,8 +4239,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, float* alpha, float** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + 
std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); @@ -4352,8 +4249,7 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, double* alpha, double** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklcpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); diff --git a/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx b/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx index c69257e9c..bfad24ca2 100644 --- a/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx +++ b/include/oneapi/mkl/blas/detail/mklgpu/blas_ct.hxx @@ -20,122 +20,121 @@ // Buffer APIs void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, + std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::herk(selector.get_queue(), upper_lower, trans, 
n, k, alpha, a, lda, beta, c, ldc); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, 
std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> 
&x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -144,10 +143,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void 
gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - double beta, sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); @@ -155,10 +154,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -167,11 +165,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t 
stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); @@ -179,9 +176,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -190,9 +187,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, 
@@ -201,9 +198,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -212,9 +209,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -222,38 +219,38 @@ void gemm_batch(backend_selector selector, transpose transa, tr } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { 
oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, 
lda, stride_a, beta, c, ldc, stride_c, @@ -261,10 +258,9 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); @@ -272,9 +268,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -283,249 +278,240 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t 
batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, 
+ std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, double s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, double c, double s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void axpy(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, 
y, incy); } void axpy(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy_batch(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, 
sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpby(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { 
oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void sdsdot(backend_selector selector, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &result) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer 
&c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemv(backend_selector selector, transpose 
trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { 
oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, std::int64_t incy, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, @@ -533,22 +519,21 @@ void gemv_batch(backend_selector selector, transpose trans, std } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, std::complex 
alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); @@ -556,52 +541,49 @@ void gemv_batch(backend_selector selector, transpose trans, std void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - 
std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size) { + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -609,277 +591,268 @@ void dgmm_batch(backend_selector selector, side 
left_right, std } void her(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void her(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, 
result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, 
std::int64_t n, - double alpha, sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, 
std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void swap(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } -void swap(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - 
sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { 
oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { 
oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t 
n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a, - std::int64_t lda) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void trsm(backend_selector selector, 
side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -887,7 +860,7 @@ void trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -895,190 +868,187 @@ void trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, 
std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& 
x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t 
incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + 
std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + 
sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t 
incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); @@ -1086,7 +1056,7 @@ void syr(backend_selector selector, uplo upper_lower, std::int6 void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1094,7 +1064,7 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1102,335 +1072,328 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, 
std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { 
oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 
1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void copy(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } -void copy(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, 
std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, 
sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), 
upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, 
float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + 
std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::mklgpu::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { 
oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void rotm(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void rotm(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void 
dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1439,8 +1402,8 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1449,10 +1412,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side 
left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1460,10 +1422,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1471,72 +1432,69 @@ void trsm_batch(backend_selector selector, side left_right, upl void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } 
void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& 
a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::mklgpu::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1544,8 +1502,8 @@ void omatcopy_batch(backend_selector selector, transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - 
std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1553,22 +1511,22 @@ void omatcopy_batch(backend_selector selector, transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1576,7 +1534,7 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t 
ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1584,16 +1542,16 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1601,9 +1559,9 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, 
std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1612,9 +1570,9 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1623,10 +1581,10 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1634,123 +1592,123 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, 
float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, 
sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } 
void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - 
sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } @@ -1758,253 +1716,250 @@ void omatadd(backend_selector selector, transpose transa, trans // USM APIs sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, 
dependencies); return done; } -sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(backend_selector selector, std::int64_t n, float alpha, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - double *x, std::int64_t incx, - const std::vector &dependencies) { + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector 
&dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, 
std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, 
std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + 
std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event 
syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* 
group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); @@ -2012,11 +1967,10 @@ sycl::event 
syrk_batch(backend_selector selector, uplo *upper_l } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2024,11 +1978,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2036,12 +1989,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, 
std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2049,12 +2001,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2062,28 +2013,28 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { 
auto done = oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -2091,126 +2042,124 @@ sycl::event hbmv(backend_selector selector, uplo upper_lower, s } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, 
+ const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - 
float *alpha, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - double *alpha, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, double* alpha, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - 
std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } sycl::event axpy_batch(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const float* x, std::int64_t incx, std::int64_t stridex, float* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2218,9 +2167,9 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const double* x, std::int64_t incx, std::int64_t stridex, double* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2228,10 +2177,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event 
axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2239,10 +2188,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2250,61 +2199,61 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpby(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, double alpha, - const double *x, 
std::int64_t incx, const double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } 
sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2312,9 +2261,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2322,10 +2271,10 @@ sycl::event syr2k(backend_selector selector, uplo 
upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2333,11 +2282,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2345,231 +2293,222 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklgpu::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float beta, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double beta, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } 
-sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event 
gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, 
std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const float* a, 
std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, float* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, double* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, std::int64_t 
*lda, - const double **x, std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + 
std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); @@ -2577,164 +2516,162 @@ sycl::event dgmm_batch(backend_selector selector, side *left_ri } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + double alpha, const 
std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, 
std::int64_t *n, std::int64_t *k, - sycl::half *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t 
**a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, 
std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector 
&dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); @@ -2742,12 +2679,12 @@ sycl::event gemm_batch(backend_selector selector, transpose *tr } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, 
sycl::half *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2756,10 +2693,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2768,10 +2705,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float 
beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2780,10 +2717,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2791,12 +2728,11 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - 
std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2804,12 +2740,11 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2817,13 +2752,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, 
std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2831,13 +2765,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2845,109 +2778,105 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, const 
float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { 
+sycl::event swap(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector 
&dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2955,9 +2884,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2965,10 +2894,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2976,10 +2905,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const 
std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2987,10 +2916,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2998,10 +2927,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3009,9 +2937,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3019,11 +2947,11 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3031,11 +2959,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event 
gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3043,11 +2971,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3055,11 +2983,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector 
selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3067,45 +2995,44 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, + std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklgpu::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3113,9 +3040,9 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, 
std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3123,10 +3050,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3134,10 +3061,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3145,11 +3072,10 @@ 
sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3157,11 +3083,10 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3169,11 +3094,11 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, 
std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3181,62 +3106,57 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - 
const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const 
std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); @@ -3244,28 +3164,28 @@ sycl::event trsm_batch(backend_selector selector, side *left_ri } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t 
incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3273,10 +3193,10 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3284,27 +3204,27 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - 
std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3312,10 +3232,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, - double beta, double 
*y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3323,10 +3242,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3334,10 +3253,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gbmv(selector.get_queue(), trans, m, n, 
kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3345,45 +3264,43 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, 
std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3391,9 +3308,9 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3401,10 +3318,10 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const 
std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3412,10 +3329,10 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3423,43 +3340,43 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const 
std::complex *y, std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3467,9 +3384,9 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event 
trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3477,10 +3394,10 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3488,267 +3405,257 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trmm(selector.get_queue(), left_right, 
upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -sycl::event rotmg(backend_selector selector, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } -sycl::event rotmg(backend_selector selector, double *d1, double *d2, - double *x1, double y1, double *param, - const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, double* d1, double* d2, double* x1, + double y1, double* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, 
std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - 
std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + 
const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const float **x, std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t 
*incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t 
stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const 
std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t 
ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3756,9 +3663,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3766,9 +3673,9 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3776,182 +3683,176 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); 
return done; } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector 
&dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event spr2(backend_selector selector, 
uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto 
done = oneapi::mkl::blas::mklgpu::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotg(backend_selector selector, float *a, float *b, float *c, - float *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, double *a, double *b, double *c, - double *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, 
std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } sycl::event sdsdot(backend_selector selector, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *result, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result, dependencies); return done; } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3959,64 +3860,63 @@ sycl::event her2k(backend_selector selector, uplo upper_lower, } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const 
std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklgpu::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4024,10 +3924,10 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event 
omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4036,9 +3936,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4047,9 +3947,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4057,18 +3957,18 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4076,9 +3976,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4086,9 +3986,9 @@ sycl::event imatcopy_batch(backend_selector selector, 
transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4096,10 +3996,10 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4108,10 +4008,10 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const 
std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4120,11 +4020,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4133,11 +4033,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd_batch( selector.get_queue(), 
transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4145,115 +4045,115 @@ sycl::event omatadd_batch(backend_selector selector, transpose } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const 
std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, 
stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return 
done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float beta, const float* b, std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4261,9 +4161,9 @@ sycl::event omatadd(backend_selector selector, transpose transa } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + double beta, const double* b, std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4272,9 +4172,9 @@ sycl::event omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, 
std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4283,9 +4183,9 @@ sycl::event omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4318,8 +4218,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4330,8 +4229,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* 
ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4341,8 +4239,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, float* alpha, float** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); @@ -4352,8 +4249,7 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, double* alpha, double** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::mklgpu::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); diff --git a/include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx b/include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx index 404d79ae0..4c94213fb 100644 --- a/include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx +++ b/include/oneapi/mkl/blas/detail/netlib/blas_ct.hxx @@ -20,122 +20,121 @@ // Buffer APIs void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + std::int64_t n, std::int64_t k, float 
alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, + std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + 
sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, 
sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float 
alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -144,10 +143,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - double beta, sycl::buffer &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); @@ -155,10 +154,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, 
std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -167,11 +165,10 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); @@ -179,9 +176,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -190,9 +187,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, 
transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -201,9 +198,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -212,9 +209,9 @@ void gemm_batch(backend_selector selector, transpose transa, tr void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, 
std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, @@ -222,38 +219,38 @@ void gemm_batch(backend_selector selector, transpose transa, tr } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { 
oneapi::mkl::blas::netlib::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -261,10 +258,9 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); @@ -272,9 +268,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), 
upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -283,249 +278,240 @@ void syrk_batch(backend_selector selector, uplo upper_lower, tr void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - 
std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, double s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector 
selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, double c, double s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void axpy(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy_batch(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { 
oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpby(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, 
alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void sdsdot(backend_selector selector, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &result) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void 
gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { 
oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t 
lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, std::int64_t incy, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, @@ -533,22 +519,21 @@ void gemv_batch(backend_selector selector, transpose trans, std } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t 
stridex, double beta, sycl::buffer& y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); @@ -556,52 +541,49 @@ void gemv_batch(backend_selector selector, transpose trans, std void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, 
sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size) { + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, 
side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -609,277 +591,268 @@ void dgmm_batch(backend_selector selector, side left_right, std } void her(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void her(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, 
n, alpha, x, incx, a); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } -void iamin(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t 
incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + 
sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::netlib::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void swap(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } -void swap(backend_selector selector, std::int64_t n, - 
sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t 
incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double 
beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - 
std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a, - std::int64_t lda) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, 
std::int64_t incy, + sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -887,7 +860,7 @@ void trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -895,190 +868,187 @@ void trsm(backend_selector selector, side left_right, uplo uppe void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, 
trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - 
sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, 
sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, 
std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, 
- sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, 
std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); @@ -1086,7 +1056,7 @@ void syr(backend_selector selector, uplo upper_lower, std::int6 void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1094,7 +1064,7 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1102,335 +1072,328 @@ void trmm(backend_selector selector, side left_right, uplo uppe void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag 
unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer &param) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer &param) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - 
diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - 
std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void copy(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } -void copy(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { 
oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void hemv(backend_selector 
selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, - std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { 
oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, - std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, 
sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag 
unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::netlib::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &a) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { 
oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void iamax(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void rotm(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void rotm(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { 
oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1439,8 +1402,8 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, 
uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, @@ -1449,10 +1412,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1460,10 +1422,9 @@ void trsm_batch(backend_selector selector, side left_right, upl void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size) { + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, 
ldb, stride_b, batch_size); @@ -1471,72 +1432,69 @@ void trsm_batch(backend_selector selector, side left_right, upl void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + 
sycl::buffer, 1>& s) { oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::netlib::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer 
&a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1544,8 +1502,8 @@ void omatcopy_batch(backend_selector selector, transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1553,22 +1511,22 @@ void omatcopy_batch(backend_selector selector, transpose trans, void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void 
imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1576,7 +1534,7 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1584,16 +1542,16 @@ void imatcopy_batch(backend_selector selector, transpose trans, void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { 
oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1601,9 +1559,9 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1612,9 +1570,9 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1623,10 +1581,10 @@ void omatadd_batch(backend_selector selector, transpose transa, void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, 
std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::netlib::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, @@ -1634,123 +1592,123 @@ void omatadd_batch(backend_selector selector, transpose transa, } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 
1>& b, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::netlib::MAJOR::omatcopy2(selector.get_queue(), trans, m, 
n, alpha, a, lda, stridea, b, ldb, strideb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, - 
std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } @@ -1758,253 +1716,250 @@ void omatadd(backend_selector selector, transpose transa, trans // USM APIs sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, 
const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } -sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(backend_selector selector, std::int64_t n, float alpha, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - double *x, std::int64_t incx, - const std::vector &dependencies) { + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + 
std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag 
unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag 
unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex 
alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, 
beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, 
n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, 
std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); @@ -2012,11 +1967,10 @@ sycl::event syrk_batch(backend_selector selector, uplo *upper_l } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2024,11 +1978,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t 
lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2036,12 +1989,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2049,12 +2001,11 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2062,28 +2013,28 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_lo } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -2091,126 +2042,124 @@ sycl::event hbmv(backend_selector selector, uplo upper_lower, s } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, float *x, - 
std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* 
y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - float *alpha, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - double *alpha, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, double* alpha, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event 
axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } sycl::event axpy_batch(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const float* x, std::int64_t incx, std::int64_t stridex, float* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2218,9 +2167,9 @@ sycl::event 
axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const double* x, std::int64_t incx, std::int64_t stridex, double* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2228,10 +2177,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2239,10 +2188,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2250,61 +2199,61 @@ sycl::event axpy_batch(backend_selector selector, std::int64_t } sycl::event axpby(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, 
std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2312,9 
+2261,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2322,10 +2271,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2333,11 +2282,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const 
std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2345,231 +2293,222 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex 
beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float beta, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const 
double *x, - std::int64_t incx, std::int64_t stridex, double beta, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - 
std::complex beta, std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + 
std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, 
std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, float* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, double* c, + std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); 
return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, - std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, 
float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, 
const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); @@ -2577,164 +2516,162 @@ sycl::event dgmm_batch(backend_selector selector, side *left_ri } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, 
std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event 
gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector 
selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - sycl::half *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* 
transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const 
std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); @@ -2742,12 +2679,11 @@ sycl::event gemm_batch(backend_selector selector, transpose *tr } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2755,12 +2691,11 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + double alpha, const 
double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2768,13 +2703,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2782,13 +2716,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t 
batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2796,12 +2729,12 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event gemm_batch(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2810,10 +2743,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, 
std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2822,10 +2755,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2834,10 +2767,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tra sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, 
std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2845,109 +2778,105 @@ sycl::event gemm_batch(backend_selector selector, transpose tra } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t 
incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, 
alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event 
nrm2(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2955,9 +2884,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2965,10 +2894,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex 
beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2976,10 +2905,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2987,10 +2916,10 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2998,10 +2927,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3009,9 +2937,9 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3019,11 +2947,11 @@ sycl::event gemm(backend_selector selector, transpose transa, t } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::uint8_t *b, std::int64_t 
ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3031,11 +2959,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3043,11 +2971,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, 
std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3055,11 +2983,11 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event gemm_bias(backend_selector selector, transpose transa, - transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + transpose transb, offset offsetc, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3067,45 +2995,44 @@ sycl::event gemm_bias(backend_selector selector, transpose tran } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, 
+ std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, 
diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3113,9 +3040,9 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3123,10 +3050,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3134,10 +3061,10 @@ sycl::event trsm(backend_selector 
selector, side left_right, up } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3145,11 +3072,10 @@ sycl::event trsm(backend_selector selector, side left_right, up } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3157,11 +3083,10 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, - const std::vector 
&dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3169,11 +3094,11 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3181,62 +3106,57 @@ sycl::event trsm_batch(backend_selector selector, side left_rig } sycl::event trsm_batch(backend_selector selector, side left_right, - uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, + std::int64_t n, std::complex alpha, 
const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, 
group_size, dependencies); @@ -3244,28 +3164,28 @@ sycl::event trsm_batch(backend_selector selector, side *left_ri } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3273,10 +3193,10 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, 
std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3284,27 +3204,27 @@ sycl::event hemm(backend_selector selector, side left_right, up } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, 
float *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3312,10 +3232,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3323,10 +3242,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); 
@@ -3334,10 +3253,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3345,45 +3264,43 @@ sycl::event gbmv(backend_selector selector, transpose trans, st } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } 
sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3391,9 +3308,9 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, const double *a, - 
std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3401,10 +3318,10 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3412,10 +3329,10 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::symm(selector.get_queue(), left_right, 
upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3423,43 +3340,43 @@ sycl::event symm(backend_selector selector, side left_right, up } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, 
a, lda, dependencies); return done; } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3467,9 +3384,9 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3477,10 +3394,10 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, 
upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3488,267 +3405,257 @@ sycl::event trmm(backend_selector selector, side left_right, up } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -sycl::event rotmg(backend_selector selector, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } -sycl::event rotmg(backend_selector selector, double *d1, double *d2, - double *x1, double y1, double *param, - const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, double* d1, double* d2, double* x1, + double y1, double* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t 
incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const float **x, std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, 
x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event 
copy_batch(backend_selector selector, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, 
std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float 
beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event gemmt(backend_selector selector, uplo upper_lower, - transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3756,9 +3663,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3766,9 +3673,9 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector 
&dependencies) { + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3776,182 +3683,176 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, s } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::asum(selector.get_queue(), n, x, incx, 
result, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, - std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const 
std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotg(backend_selector selector, float *a, float *b, float *c, - float *s, const std::vector &dependencies) { 
+sycl::event rotg(backend_selector selector, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, double *a, double *b, double *c, - double *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } sycl::event sdsdot(backend_selector selector, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *result, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result, dependencies); return done; } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, 
std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3959,64 +3860,63 @@ sycl::event her2k(backend_selector selector, uplo upper_lower, } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double 
*result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return 
done; } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4024,10 +3924,10 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4036,10 +3936,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4048,10 +3947,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -4059,19 +3957,18 @@ sycl::event omatcopy_batch(backend_selector selector, transpose } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, - const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, 
alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4079,9 +3976,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4089,9 +3986,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4099,10 +3996,10 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch( 
selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4111,10 +4008,10 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4123,12 +4020,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4137,12 +4033,11 @@ sycl::event 
omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4150,115 +4045,115 @@ sycl::event omatadd_batch(backend_selector selector, transpose } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event 
omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, 
double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, 
double alpha, double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float beta, const float* b, std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4266,9 +4161,9 @@ sycl::event omatadd(backend_selector selector, transpose transa } sycl::event omatadd(backend_selector 
selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - double beta, const double *b, std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + double beta, const double* b, std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4277,9 +4172,9 @@ sycl::event omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4288,9 +4183,9 @@ sycl::event omatadd(backend_selector selector, transpose transa sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, 
a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4323,8 +4218,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4335,8 +4229,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4346,8 +4239,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, float* alpha, float** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); @@ -4357,8 +4249,7 @@ sycl::event imatcopy_batch(backend_selector selector, transpose sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, double* alpha, double** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const 
std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::netlib::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); diff --git a/include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx b/include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx index fbb64a6a0..ef0db5b09 100644 --- a/include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx +++ b/include/oneapi/mkl/blas/detail/onemkl_blas_backends.hxx @@ -19,2879 +19,2786 @@ // Buffer APIs -ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc); + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - double beta, sycl::buffer &c, std::int64_t ldc); + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + 
sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::half alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::half beta, sycl::buffer &c, std::int64_t ldc); + std::int64_t k, sycl::half alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::half beta, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc); + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + float beta, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t 
lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc); + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void symm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void symm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, + std::int64_t ldc); -ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void symm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc); -ONEMKL_EXPORT void symm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void symm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - 
std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void hemm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void hemm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, + std::int64_t ldc); -ONEMKL_EXPORT void hemm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void hemm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &c, std::int64_t 
ldc); + sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &c, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &c, + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, 
oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void herk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer, 1> &a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& a, std::int64_t lda, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void herk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& a, std::int64_t lda, double beta, + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo 
upper_lower, +ONEMKL_EXPORT void syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, + std::int64_t ldc); -ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void 
her2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, float beta, sycl::buffer, 1> &c, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, + std::int64_t ldb, float beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void her2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, - std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, + std::int64_t ldb, double beta, sycl::buffer, 1>& c, + std::int64_t ldc); -ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb); -ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb); -ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, 
oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb); -ONEMKL_EXPORT void trmm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb); -ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb); -ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb); -ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 
1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb); -ONEMKL_EXPORT void trsm(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb); -ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t 
incy); -ONEMKL_EXPORT void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &x, +ONEMKL_EXPORT void gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, float beta, - sycl::buffer &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size); + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, double beta, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void gemv_batch(sycl::queue& queue, 
oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size); +ONEMKL_EXPORT void dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, +ONEMKL_EXPORT void dgmm_batch(sycl::queue& 
queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, +ONEMKL_EXPORT void dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, +ONEMKL_EXPORT void dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy); -ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void 
gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy); -ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -ONEMKL_EXPORT void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, 
sycl::buffer, 1> &a, + sycl::buffer, 1>& y, std::int64_t incy); + +ONEMKL_EXPORT void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); + +ONEMKL_EXPORT void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); + +ONEMKL_EXPORT void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +ONEMKL_EXPORT void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +ONEMKL_EXPORT void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +ONEMKL_EXPORT void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void 
hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, +ONEMKL_EXPORT void hbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, +ONEMKL_EXPORT void hbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +ONEMKL_EXPORT void hemv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, +ONEMKL_EXPORT void hemv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t 
n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda); +ONEMKL_EXPORT void her(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a, - std::int64_t lda); +ONEMKL_EXPORT void her(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +ONEMKL_EXPORT void her2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, +ONEMKL_EXPORT void her2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); -ONEMKL_EXPORT void hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +ONEMKL_EXPORT void hpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, 
sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +ONEMKL_EXPORT void hpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a); - -ONEMKL_EXPORT void hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a); - -ONEMKL_EXPORT void hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a); - -ONEMKL_EXPORT void hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a); - -ONEMKL_EXPORT void sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); - -ONEMKL_EXPORT void sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); - -ONEMKL_EXPORT void symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, 
std::int64_t incy); - -ONEMKL_EXPORT void symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - -ONEMKL_EXPORT void syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - -ONEMKL_EXPORT void spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, +ONEMKL_EXPORT void hpr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); + +ONEMKL_EXPORT void hpr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); + +ONEMKL_EXPORT void hpr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a); + +ONEMKL_EXPORT void hpr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, 
+ std::int64_t incy, sycl::buffer, 1>& a); + +ONEMKL_EXPORT void sbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + +ONEMKL_EXPORT void sbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + +ONEMKL_EXPORT void symv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + +ONEMKL_EXPORT void symv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + +ONEMKL_EXPORT void syr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); + +ONEMKL_EXPORT void syr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); + +ONEMKL_EXPORT void syr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); + +ONEMKL_EXPORT void syr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a, + std::int64_t lda); + +ONEMKL_EXPORT void spmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx, float beta, 
sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void spmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, + std::int64_t incy); -ONEMKL_EXPORT void spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a); +ONEMKL_EXPORT void spr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); -ONEMKL_EXPORT void spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a); +ONEMKL_EXPORT void spr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); -ONEMKL_EXPORT void spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); +ONEMKL_EXPORT void spr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a); -ONEMKL_EXPORT void spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); +ONEMKL_EXPORT void spr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& a); -ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, 
+ONEMKL_EXPORT void tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx); + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx); + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, 
std::int64_t incx); -ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx); + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, - std::int64_t incx); + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void 
tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, - std::int64_t incx); + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, - 
sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); -ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); -ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, 
sycl::buffer& x, + std::int64_t incx); -ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx); -ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void dotc(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +ONEMKL_EXPORT void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& result); -ONEMKL_EXPORT void dotc(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +ONEMKL_EXPORT void dotc(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -ONEMKL_EXPORT void dotu(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t 
incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +ONEMKL_EXPORT void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& result); -ONEMKL_EXPORT void dotu(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +ONEMKL_EXPORT void dotu(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void iamax(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); -ONEMKL_EXPORT void iamax(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void iamax(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); -ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void 
iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void iamin(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); -ONEMKL_EXPORT void iamin(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void iamin(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); -ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void asum(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); -ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void axpy(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, double alpha, - 
sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void axpy(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy); -ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +ONEMKL_EXPORT void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +ONEMKL_EXPORT void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, - std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size); -ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +ONEMKL_EXPORT void 
axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +ONEMKL_EXPORT void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void axpby(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, + std::int64_t incy); -ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void axpby(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +ONEMKL_EXPORT void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, +ONEMKL_EXPORT void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); 
-ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +ONEMKL_EXPORT void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy); -ONEMKL_EXPORT void copy(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +ONEMKL_EXPORT void copy(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size); +ONEMKL_EXPORT void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, +ONEMKL_EXPORT void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> 
&x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +ONEMKL_EXPORT void copy_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void copy_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &y, +ONEMKL_EXPORT void copy_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -ONEMKL_EXPORT void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); +ONEMKL_EXPORT void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); -ONEMKL_EXPORT void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); +ONEMKL_EXPORT void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); -ONEMKL_EXPORT void sdsdot(sycl::queue &queue, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); +ONEMKL_EXPORT void sdsdot(sycl::queue& queue, std::int64_t n, float sb, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); -ONEMKL_EXPORT void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); +ONEMKL_EXPORT void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); -ONEMKL_EXPORT void 
nrm2(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void nrm2(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); +ONEMKL_EXPORT void nrm2(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); -ONEMKL_EXPORT void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); +ONEMKL_EXPORT void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); -ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, - float s); +ONEMKL_EXPORT void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, float c, float s); -ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, - double s); +ONEMKL_EXPORT void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, double c, double s); -ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, +ONEMKL_EXPORT void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s); -ONEMKL_EXPORT void rot(sycl::queue &queue, std::int64_t n, 
sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - double c, double s); +ONEMKL_EXPORT void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, + double s); -ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); +ONEMKL_EXPORT void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); +ONEMKL_EXPORT void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); +ONEMKL_EXPORT void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -ONEMKL_EXPORT void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, - sycl::buffer &c, - sycl::buffer, 1> &s); +ONEMKL_EXPORT void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -ONEMKL_EXPORT void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m); +ONEMKL_EXPORT void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param); -ONEMKL_EXPORT void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m); +ONEMKL_EXPORT void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param); -ONEMKL_EXPORT void rotmg(sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer 
¶m); +ONEMKL_EXPORT void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param); -ONEMKL_EXPORT void rotmg(sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, - double y1, sycl::buffer ¶m); +ONEMKL_EXPORT void rotmg(sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param); -ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx); +ONEMKL_EXPORT void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx); -ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx); +ONEMKL_EXPORT void scal(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx); -ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); +ONEMKL_EXPORT void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); +ONEMKL_EXPORT void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx); +ONEMKL_EXPORT void scal(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void scal(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx); +ONEMKL_EXPORT void scal(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx); -ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void 
swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +ONEMKL_EXPORT void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +ONEMKL_EXPORT void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy); -ONEMKL_EXPORT void swap(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +ONEMKL_EXPORT void swap(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, float beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, 
std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, double beta, sycl::buffer &c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); + std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::half alpha, sycl::buffer &a, + std::int64_t k, 
sycl::half alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, sycl::half beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, float beta, sycl::buffer &c, + sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, float beta, sycl::buffer &c, + sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, float beta, sycl::buffer &c, + 
sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t 
batch_size); -ONEMKL_EXPORT void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +ONEMKL_EXPORT void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc); + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + double beta, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, 
std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +ONEMKL_EXPORT void gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, - float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); -ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, - float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, 
float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); -ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, - float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); -ONEMKL_EXPORT void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, - float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); -ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, +ONEMKL_EXPORT void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, +ONEMKL_EXPORT void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose 
trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -ONEMKL_EXPORT void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, +ONEMKL_EXPORT void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, +ONEMKL_EXPORT void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void 
imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &b, + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); 
-ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); +ONEMKL_EXPORT void omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); +ONEMKL_EXPORT void omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, 
sycl::buffer& b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); -ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &b, +ONEMKL_EXPORT void omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer &b, +ONEMKL_EXPORT void omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t 
stridea, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); -ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, +ONEMKL_EXPORT void imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, +ONEMKL_EXPORT void imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT void imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); -ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, 
+ONEMKL_EXPORT void omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, + float alpha, sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, + double alpha, sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); -ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); -ONEMKL_EXPORT void omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT void omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); // USM APIs -ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, - 
oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::half alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - 
const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const bfloat16 *a, std::int64_t lda, - const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, - const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, - const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, - const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, 
- const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hemm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hemm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, 
std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, float beta, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, double beta, - double *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *k, float *alpha, const float **a, - std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *k, double *alpha, const double **a, - std::int64_t *lda, double 
*beta, double **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector 
&dependencies = {}); - -ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, - std::complex *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, const std::complex *a, std::int64_t lda, - float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, const std::complex *a, std::int64_t lda, - double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); 
- -ONEMKL_EXPORT sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, - std::int64_t ldb, double beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, 
std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, + const 
std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const bfloat16* a, std::int64_t lda, + const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::int8_t* a, std::int64_t lda, std::int8_t ao, + const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::int8_t* a, std::int64_t lda, std::int8_t ao, + const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + 
float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, + const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, + const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event symm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event symm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event symm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = 
{}); + +ONEMKL_EXPORT sycl::event symm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hemm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hemm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, double beta, + double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk(sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* k, float* alpha, const float** a, + std::int64_t* lda, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* k, double* alpha, const double** a, + std::int64_t* lda, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float beta, float* c, std::int64_t ldc, + 
std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double beta, double* c, + std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, - float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + 
oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, - oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - double *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm_batch( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm_batch( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, - oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT 
sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, - oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm_batch( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsm_batch( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const 
std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stridea, - const float *x, std::int64_t incx, std::int64_t stridex, - float beta, float *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, + std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event herk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + float alpha, const std::complex* a, std::int64_t lda, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event herk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + double alpha, const std::complex* a, std::int64_t lda, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + float alpha, 
const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, 
oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm(sycl::queue& queue, oneapi::mkl::side left_right, + 
oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + oneapi::mkl::uplo upper_lower, 
oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, + float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event 
gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float beta, float* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stridea, - const double *x, std::int64_t incx, std::int64_t stridex, - double beta, double *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event 
gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double beta, + double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event gemv_batch( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, std::complex beta, - std::complex *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, std::complex beta, + std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event gemv_batch( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, - const float **a, std::int64_t *lda, const float **x, - std::int64_t *incx, float *beta, float **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, 
oneapi::mkl::transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, - const double **a, std::int64_t *lda, const double **x, - std::int64_t *incx, double *beta, double **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, + const float** a, std::int64_t* lda, const float** x, + std::int64_t* incx, float* beta, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, + const double** a, std::int64_t* lda, const double** x, + std::int64_t* incx, double* beta, double** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + 
const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + std::int64_t m, std::int64_t n, const float* a, + std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float* c, + std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gemv_batch( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, - std::int64_t m, std::int64_t n, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float *c, - std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + std::int64_t m, std::int64_t n, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double* c, + std::int64_t ldc, std::int64_t 
stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, + std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, + std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, + double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + std::int64_t* m, std::int64_t* n, + const 
std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, + const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, + const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double* y, + std::int64_t incy, 
double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hemv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, 
std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hemv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event her2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, + 
std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hpr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event sbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event sbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event symv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event symv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, 
const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, + std::int64_t lda, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* a, + std::int64_t lda, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const float* a, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, const double* a, const double* x, std::int64_t incx, + double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* a, + const 
std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event spr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT 
sycl::event tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + 
oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, 
std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + 
std::complex* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamax(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamax(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamin(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamin(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event asum(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event 
asum(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, double* alpha, + const double** x, std::int64_t* incx, double** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, + const 
std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, std::int64_t stridex, + float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, - std::int64_t m, std::int64_t n, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double *c, - std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, - std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, - std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, - std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - std::int64_t *m, std::int64_t *n, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, - float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, - double **c, std::int64_t *ldc, 
std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, const std::complex *a, 
- std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, 
- std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, std::int64_t stridex, + double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const std::complex *x, - std::int64_t incx, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const std::complex *x, - std::int64_t incx, std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - 
const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpby(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpby(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, + std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + 
+ONEMKL_EXPORT sycl::event copy(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const double** x, + std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, - 
std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, - std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const std::complex *x, - std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const std::complex *x, - std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - 
std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - float *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - double *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies = 
{}); - -ONEMKL_EXPORT sycl::event spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *a, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *a, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - float *a, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - double *a, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT 
sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, - std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies 
= {}); - -ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const 
float *a, std::int64_t lda, float *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, 
std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dotc(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dotc(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dotu(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dotu(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamax(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); - 
-ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event iamin(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha, - const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, 
std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha, - const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, - std::complex *alpha, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, - float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event copy_batch(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, - double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector 
&dependencies = {}); - -ONEMKL_EXPORT sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, - std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, - double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event axpby(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event 
copy_batch(sycl::queue &queue, std::int64_t *n, const float **x, - std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x, - std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t stridex, double *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event dot(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - 
std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sdsdot(sycl::queue& queue, std::int64_t n, float sb, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event copy_batch(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, - std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - float *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, - double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event nrm2(sycl::queue &queue, 
std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - float c, float s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - double c, double s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, - double s, const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, - double *s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotg(sycl::queue &queue, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, - float y1, float *param, - const 
std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, - double y1, double *param, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, double *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, - std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - 
oneapi::mkl::transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, 
sycl::half *alpha, - const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, - float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float *alpha, - const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, - std::int32_t **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, - float beta, float *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event 
gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, - double beta, double *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event nrm2(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event nrm2(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, float c, + float s, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + double c, double s, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rot(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, + float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rot(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, + double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies = 
{}); + +ONEMKL_EXPORT sycl::event rotg(sycl::queue& queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, + float* c, std::complex* s, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, + double* c, std::complex* s, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotm(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, + float* y, std::int64_t incy, float* param, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotm(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, + double* y, std::int64_t incy, double* param, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, float y1, + float* param, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, double y1, + double* param, const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, float* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, double* x, + std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event 
scal(sycl::queue& queue, std::int64_t n, double alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event swap(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, + float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event swap(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, + double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, + 
std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, sycl::half* alpha, + const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, + const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float* alpha, + const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, + float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, + std::int64_t* n, 
std::int64_t* k, float* alpha, + const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, + std::int32_t** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, const float* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, const double* b, + std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event gemm_batch( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event 
gemm_batch( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, + oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, + std::int64_t ldb, std::int64_t stride_b, sycl::half beta, + sycl::half* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const 
sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, - std::int64_t ldb, std::int64_t stride_b, float beta, float *c, + std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, + std::int64_t ldb, std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, - std::int64_t ldb, std::int64_t stride_b, float beta, float *c, + std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, + std::int64_t ldb, std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, + std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, 
oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, - float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, - std::int64_t ldc, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, + float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, 
+ oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, + double* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t n, std::int64_t k, std::complex alpha, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, + std::int64_t ldc, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, - float *b, std::int64_t ldb, std::int64_t stride_b, + const float* a, std::int64_t lda, std::int64_t stride_a, + float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, - double *b, std::int64_t ldb, std::int64_t stride_b, + const double* a, std::int64_t lda, std::int64_t stride_a, + double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - 
const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, +ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, +ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, 
std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, float alpha, const float *a, + std::int64_t n, float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, - float *c, std::int64_t ldc, std::int64_t stride_c, + const float* b, std::int64_t ldb, std::int64_t stride_b, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t 
n, double alpha, const double *a, + std::int64_t n, double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, - double *c, std::int64_t ldc, std::int64_t stride_c, + const double* b, std::int64_t ldb, std::int64_t stride_b, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event omatadd_batch( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex *c, std::int64_t ldc, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, const std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event omatadd_batch( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, std::complex *c, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const 
std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - float *b, std::int64_t ldb, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - double *b, std::int64_t ldb, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + double* b, std::int64_t ldb, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT sycl::event omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT sycl::event omatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event 
omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, float *b, +ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, double *b, +ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT sycl::event omatcopy2(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, 
std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, +ONEMKL_EXPORT sycl::event imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, +ONEMKL_EXPORT sycl::event imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT sycl::event imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event imatcopy(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +ONEMKL_EXPORT sycl::event imatcopy(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, 
oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, float beta, - const float *b, std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); + float alpha, const float* a, std::int64_t lda, float beta, + const float* b, std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, double beta, - const double *b, std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); + double alpha, const double* a, std::int64_t lda, double beta, + const double* b, std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event omatadd(sycl::queue &queue, oneapi::mkl::transpose transa, +ONEMKL_EXPORT sycl::event omatadd(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, 
std::int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, std::int64_t* m, std::int64_t* n, float* alpha, const float** a, diff --git a/include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx b/include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx index 8a66ed707..6f56157ba 100644 --- a/include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx +++ b/include/oneapi/mkl/blas/detail/portblas/blas_ct.hxx @@ -20,123 +20,123 @@ // Buffer APIs void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1> &a, - std::int64_t lda, double beta, sycl::buffer, 1> &c, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { + sycl::buffer& x, std::int64_t incx) { 
oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, 
std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, 
sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void spr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -145,9 +145,9 @@ void gemm_batch(backend_selector selector, transpose transa, void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t 
ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -156,9 +156,9 @@ void gemm_batch(backend_selector selector, transpose transa, void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -167,9 +167,9 @@ void gemm_batch(backend_selector selector, transpose transa, void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -178,9 +178,9 @@ void gemm_batch(backend_selector selector, transpose transa, void gemm_batch(backend_selector selector, transpose transa, transpose 
transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -189,9 +189,9 @@ void gemm_batch(backend_selector selector, transpose transa, void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -200,9 +200,9 @@ void gemm_batch(backend_selector selector, transpose transa, void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, 
std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -211,9 +211,9 @@ void gemm_batch(backend_selector selector, transpose transa, void gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -221,38 +221,38 @@ void gemm_batch(backend_selector selector, transpose transa, } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - float beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose 
trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -260,8 +260,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &c, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, 
alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -270,8 +270,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -280,8 +280,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -289,236 +289,236 @@ void syrk_batch(backend_selector selector, uplo upper_lower, } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, 
sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void rot(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double 
c, double s) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, double c, double s) { +void rot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void axpy(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 
1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy_batch(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) 
{ oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpby(backend_selector selector, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void sdsdot(backend_selector selector, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &result) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - 
std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose 
trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void 
gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, std::int64_t incy, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, @@ -526,9 +526,9 @@ void gemv_batch(backend_selector selector, transpose trans, s } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - 
std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, std::int64_t incy, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, @@ -536,10 +536,10 @@ void gemv_batch(backend_selector selector, transpose trans, s } void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &x, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, @@ -548,9 +548,9 @@ void gemv_batch(backend_selector selector, transpose trans, s void gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, 
stridex, beta, y, incy, @@ -558,9 +558,9 @@ void gemv_batch(backend_selector selector, transpose trans, s } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -568,9 +568,9 @@ void dgmm_batch(backend_selector selector, side left_right, s } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -578,9 +578,9 @@ void dgmm_batch(backend_selector selector, side left_right, s } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t 
stridec, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -588,9 +588,9 @@ void dgmm_batch(backend_selector selector, side left_right, s } void dgmm_batch(backend_selector selector, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, std::int64_t ldc, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -598,90 +598,90 @@ void dgmm_batch(backend_selector selector, side left_right, s } void her(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void her(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { 
oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } -void iamin(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } -void iamin(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void iamin(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { 
oneapi::mkl::blas::portblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { + float alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { 
oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); @@ -689,9 +689,9 @@ void gemm_bias(backend_selector selector, transpose transa, t void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); @@ -699,9 +699,9 @@ void gemm_bias(backend_selector selector, transpose transa, t void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); @@ -709,163 +709,163 @@ void gemm_bias(backend_selector selector, transpose transa, t void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { 
+ sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { oneapi::mkl::blas::portblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void swap(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } -void swap(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) 
{ oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void nrm2(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, 
std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, 
b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { 
oneapi::mkl::blas::portblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -873,7 +873,7 @@ void trsm(backend_selector selector, side left_right, uplo up void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, 
std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -881,187 +881,187 @@ void trsm(backend_selector selector, side left_right, uplo up void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void dotu(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& 
result) { oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { 
oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - 
sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { 
oneapi::mkl::blas::portblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t 
ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void dotc(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void syr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); @@ -1069,7 +1069,7 @@ void syr(backend_selector selector, uplo upper_lower, std::in void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + 
sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1077,7 +1077,7 @@ void trmm(backend_selector selector, side left_right, uplo up void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); @@ -1085,328 +1085,328 @@ void trmm(backend_selector selector, side left_right, uplo up void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + 
sycl::buffer& param) { oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, 
std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void copy(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } -void copy(backend_selector selector, std::int64_t n, sycl::buffer &x, - 
std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 
1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - 
sycl::buffer &c, std::int64_t ldc) { + transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, 
a, lda, b, ldb, beta, c, ldc); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void asum(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, 
std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { oneapi::mkl::blas::portblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void 
spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } -void iamax(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void iamax(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void iamax(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void 
rotm(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void rotm(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(backend_selector selector, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } void 
trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, @@ -1415,8 +1415,8 @@ void trsm_batch(backend_selector selector, side left_right, u void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, @@ -1425,8 +1425,8 @@ void trsm_batch(backend_selector selector, side left_right, u void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, @@ -1435,8 +1435,8 @@ void trsm_batch(backend_selector selector, side left_right, u void trsm_batch(backend_selector selector, side 
left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, @@ -1445,69 +1445,69 @@ void trsm_batch(backend_selector selector, side left_right, u void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector 
selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { oneapi::mkl::blas::portblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, 
std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1515,8 +1515,8 @@ void omatcopy_batch(backend_selector selector, transpose tran void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1524,22 +1524,22 @@ void omatcopy_batch(backend_selector selector, transpose tran void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { 
oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1547,7 +1547,7 @@ void imatcopy_batch(backend_selector selector, transpose tran void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1555,16 +1555,16 @@ void imatcopy_batch(backend_selector selector, transpose tran void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void omatadd_batch(backend_selector 
selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1572,9 +1572,9 @@ void omatadd_batch(backend_selector selector, transpose trans } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1583,9 +1583,9 @@ void omatadd_batch(backend_selector selector, transpose trans void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t 
stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1594,10 +1594,10 @@ void omatadd_batch(backend_selector selector, transpose trans void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::portblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1605,123 +1605,123 @@ void omatadd_batch(backend_selector selector, transpose trans } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, 
sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, 
alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::portblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, 
sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), 
transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } @@ -1729,185 +1729,185 @@ void omatadd(backend_selector selector, transpose transa, tra // USM APIs sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, std::int64_t lda, const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr2( selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr2( selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - float *x, std::int64_t incx, const std::vector &dependencies) { + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - double *x, std::int64_t incx, const std::vector &dependencies) { + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, 
std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trmv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, 
double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trmv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trmv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trmv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, 
std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, 
double* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hpmv( selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hpmv( selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo 
upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; @@ -1915,9 +1915,9 @@ sycl::event syrk(backend_selector selector, uplo upper_lower, sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; @@ -1925,54 +1925,54 @@ sycl::event syrk(backend_selector selector, uplo upper_lower, sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, float 
*beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* 
beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); @@ -1980,10 +1980,10 @@ sycl::event syrk_batch(backend_selector selector, uplo *upper } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float beta, float *c, + transpose trans, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -1992,9 +1992,9 @@ sycl::event 
syrk_batch(backend_selector selector, uplo upper_ sycl::event syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2003,10 +2003,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_ sycl::event syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, std::int64_t ldc, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2015,10 +2015,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_ sycl::event syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, std::int64_t ldc, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t 
stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2026,28 +2026,28 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_ } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::her2( selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event her2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::her2( selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex 
beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -2055,10 +2055,10 @@ sycl::event hbmv(backend_selector selector, uplo upper_lower, } sycl::event hbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -2066,114 +2066,114 @@ sycl::event hbmv(backend_selector selector, uplo upper_lower, } sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, const std::vector &dependencies) { + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, float c, float s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } sycl::event rot(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, + std::complex* x, std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, float *x, - 
std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + 
std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, float *alpha, - const float **x, std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, float* alpha, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, double *alpha, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, double* alpha, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event 
axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } sycl::event axpy_batch(backend_selector selector, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, + const float* x, std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2181,9 +2181,9 @@ sycl::event axpy_batch(backend_selector selector, std::int64_ } sycl::event 
axpy_batch(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2191,10 +2191,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_ } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2202,10 +2202,10 @@ sycl::event axpy_batch(backend_selector selector, std::int64_ } sycl::event axpy_batch(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2213,61 +2213,61 @@ sycl::event axpy_batch(backend_selector selector, std::int64_ } sycl::event axpby(backend_selector selector, 
std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, 
std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event gerc(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2275,9 +2275,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, 
std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2286,9 +2286,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2297,9 +2297,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2307,49 +2307,49 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower } sycl::event 
gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - 
std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float beta, float *y, std::int64_t incy, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2357,11 +2357,11 @@ sycl::event gemv_batch(backend_selector selector, transpose t } sycl::event gemv_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double beta, double *y, std::int64_t incy, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2370,11 +2370,11 @@ sycl::event gemv_batch(backend_selector selector, transpose t sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2383,58 +2383,58 @@ sycl::event gemv_batch(backend_selector selector, transpose t sycl::event gemv_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } 
-sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t 
group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); @@ -2442,10 +2442,10 @@ sycl::event gemv_batch(backend_selector selector, transpose * } sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, const 
float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); @@ -2453,10 +2453,10 @@ sycl::event dgmm_batch(backend_selector selector, side left_r } sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); @@ -2464,11 +2464,11 @@ sycl::event dgmm_batch(backend_selector selector, side left_r } sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( 
selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); @@ -2476,55 +2476,55 @@ sycl::event dgmm_batch(backend_selector selector, side left_r } sycl::event dgmm_batch(backend_selector selector, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, 
std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); @@ -2532,162 +2532,162 @@ sycl::event dgmm_batch(backend_selector selector, side *left_ } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event her(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamin(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - sycl::half *alpha, const sycl::half **a, std::int64_t 
*lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float 
**c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const 
std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **b, std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* 
transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); @@ -2696,11 +2696,11 @@ sycl::event gemm_batch(backend_selector selector, transpose * sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, + sycl::half alpha, const sycl::half* a, std::int64_t 
lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2709,10 +2709,10 @@ sycl::event gemm_batch(backend_selector selector, transpose t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2721,10 +2721,10 @@ sycl::event gemm_batch(backend_selector selector, transpose t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const 
std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2733,10 +2733,10 @@ sycl::event gemm_batch(backend_selector selector, transpose t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2745,10 +2745,10 @@ sycl::event gemm_batch(backend_selector selector, transpose t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, 
std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2757,10 +2757,10 @@ sycl::event gemm_batch(backend_selector selector, transpose t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2769,11 +2769,11 @@ sycl::event gemm_batch(backend_selector selector, transpose t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto 
done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2782,11 +2782,11 @@ sycl::event gemm_batch(backend_selector selector, transpose t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2794,105 +2794,105 @@ sycl::event gemm_batch(backend_selector selector, transpose t } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, const std::vector &dependencies) { + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::spmv( selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event spmv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, const std::vector &dependencies) { + double alpha, const 
double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::spmv( selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event swap(backend_selector selector, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event 
geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event geru(backend_selector selector, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event nrm2(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const float *x, - std::int64_t 
incx, float *result, const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2900,9 +2900,9 @@ sycl::event gemm(backend_selector selector, transpose transa, } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) 
{ auto done = oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2911,9 +2911,9 @@ sycl::event gemm(backend_selector selector, transpose transa, sycl::event gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2922,9 +2922,9 @@ sycl::event gemm(backend_selector selector, transpose transa, sycl::event gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2933,9 +2933,9 @@ sycl::event gemm(backend_selector selector, transpose transa, sycl::event gemm(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const 
std::vector &dependencies) { + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2943,9 +2943,9 @@ sycl::event gemm(backend_selector selector, transpose transa, } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2953,9 +2953,9 @@ sycl::event gemm(backend_selector selector, transpose transa, } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2964,10 +2964,10 @@ sycl::event gemm(backend_selector selector, transpose transa, sycl::event gemm_bias(backend_selector selector, 
transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -2976,10 +2976,10 @@ sycl::event gemm_bias(backend_selector selector, transpose tr sycl::event gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -2988,10 +2988,10 @@ sycl::event gemm_bias(backend_selector selector, transpose tr sycl::event gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, 
const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3000,10 +3000,10 @@ sycl::event gemm_bias(backend_selector selector, transpose tr sycl::event gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -3011,35 +3011,35 @@ sycl::event gemm_bias(backend_selector selector, transpose tr } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, + std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + 
const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, std::int64_t lda, const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event ger(backend_selector selector, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; @@ -3047,8 +3047,8 @@ sycl::event ger(backend_selector selector, std::int64_t m, st sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, 
transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3057,8 +3057,8 @@ sycl::event trsm(backend_selector selector, side left_right, sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3067,9 +3067,9 @@ sycl::event trsm(backend_selector selector, side left_right, sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3078,9 +3078,9 @@ sycl::event trsm(backend_selector selector, side left_right, sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t 
lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3089,9 +3089,9 @@ sycl::event trsm(backend_selector selector, side left_right, sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3100,9 +3100,9 @@ sycl::event trsm_batch(backend_selector selector, side left_r sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, 
stride_b, batch_size, dependencies); @@ -3111,10 +3111,10 @@ sycl::event trsm_batch(backend_selector selector, side left_r sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3123,56 +3123,56 @@ sycl::event trsm_batch(backend_selector selector, side left_r sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { 
+sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* 
lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); @@ -3180,18 +3180,18 @@ sycl::event trsm_batch(backend_selector selector, side *left_ } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotu(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const 
std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; @@ -3199,9 +3199,9 @@ sycl::event dotu(backend_selector selector, std::int64_t n, sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3210,9 +3210,9 @@ sycl::event hemm(backend_selector selector, side left_right, sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3220,27 +3220,27 @@ sycl::event hemm(backend_selector selector, side left_right, } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const 
std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event hpr2(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3248,9 +3248,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, } sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { + 
std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3259,9 +3259,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3270,9 +3270,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, sycl::event gbmv(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3280,43 +3280,43 @@ sycl::event gbmv(backend_selector selector, transpose trans, } sycl::event tbmv(backend_selector selector, uplo upper_lower, 
transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) 
{ auto done = oneapi::mkl::blas::portblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3324,9 +3324,9 @@ sycl::event symm(backend_selector selector, side left_right, } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3335,9 +3335,9 @@ sycl::event symm(backend_selector selector, side left_right, sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex 
beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3346,9 +3346,9 @@ sycl::event symm(backend_selector selector, side left_right, sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3356,34 +3356,34 @@ sycl::event symm(backend_selector selector, side left_right, } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotc(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, 
dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; @@ -3391,8 +3391,8 @@ sycl::event syr(backend_selector selector, uplo upper_lower, sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3401,8 +3401,8 @@ sycl::event trmm(backend_selector selector, side left_right, sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3411,9 +3411,9 @@ sycl::event trmm(backend_selector selector, side left_right, sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3422,225 +3422,225 @@ sycl::event trmm(backend_selector selector, side left_right, sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -sycl::event rotmg(backend_selector selector, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } -sycl::event rotmg(backend_selector selector, double *d1, double *d2, double *x1, - double y1, 
double *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, double* d1, double* d2, double* x1, + double y1, double* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const 
std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const 
std::vector &dependencies) { + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const float **x, std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } 
-sycl::event copy_batch(backend_selector selector, std::int64_t *n, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy, +sycl::event copy_batch(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector 
&dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, 
std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { + transpose transb, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3648,9 +3648,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower } sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { + transpose transb, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3659,9 +3659,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - 
std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3670,9 +3670,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3680,9 +3680,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower } sycl::event sbmv(backend_selector selector, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3690,9 +3690,9 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, } sycl::event sbmv(backend_selector 
selector, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3700,166 +3700,166 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event asum(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, float *result, const std::vector &dependencies) { +sycl::event asum(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event 
asum(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - 
diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, const std::vector &dependencies) { + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, const std::vector &dependencies) { + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamax(backend_selector selector, std::int64_t n, const double 
*x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotm(backend_selector selector, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotg(backend_selector selector, float *a, float *b, float *c, - float *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, float* a, float* b, float* c, + float* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, double *a, double *b, double *c, - double *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } sycl::event sdsdot(backend_selector selector, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *result, const std::vector &dependencies) { + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result, dependencies); return done; @@ -3867,9 +3867,9 @@ sycl::event sdsdot(backend_selector selector, std::int64_t n, sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3878,62 +3878,62 @@ sycl::event her2k(backend_selector selector, uplo upper_lower sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, 
const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3941,10 +3941,10 @@ sycl::event omatcopy_batch(backend_selector selector, transpo } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3953,9 +3953,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpo sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const 
std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3964,9 +3964,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpo sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3974,18 +3974,18 @@ sycl::event omatcopy_batch(backend_selector selector, transpo } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t 
stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -3993,9 +3993,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpo sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4003,9 +4003,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpo sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -4013,10 +4013,10 @@ sycl::event imatcopy_batch(backend_selector selector, transpo sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t 
stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4025,10 +4025,10 @@ sycl::event omatadd_batch(backend_selector selector, transpos sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4037,11 +4037,11 @@ sycl::event omatadd_batch(backend_selector selector, transpos sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4050,11 +4050,11 @@ sycl::event omatadd_batch(backend_selector selector, transpos sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -4062,115 +4062,115 @@ sycl::event omatadd_batch(backend_selector selector, transpos } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, double* b, + 
std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, 
std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event omatadd(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float beta, const float *b, std::int64_t ldb, float *c, - std::int64_t ldc, const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float beta, const float* b, std::int64_t ldb, float* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4178,9 +4178,9 @@ sycl::event omatadd(backend_selector selector, transpose tran } sycl::event omatadd(backend_selector selector, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double beta, const double *b, std::int64_t ldb, double *c, - std::int64_t ldc, const std::vector &dependencies) { + transpose transb, std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double beta, const double* b, std::int64_t ldb, double* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4189,9 +4189,9 @@ sycl::event omatadd(backend_selector selector, transpose tran sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4200,95 +4200,95 @@ sycl::event omatadd(backend_selector selector, transpose tran sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { 
+ const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); return done; } -sycl::event omatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); return done; } -sycl::event omatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); return done; } -sycl::event omatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t 
*lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); return done; } -sycl::event omatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event omatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); return done; } -sycl::event imatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, float **ab, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, float** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( 
selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); return done; } -sycl::event imatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, double **ab, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, double** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); return done; } -sycl::event imatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); return done; } -sycl::event imatcopy_batch(backend_selector selector, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(backend_selector selector, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, 
std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::portblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); diff --git a/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx b/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx index bc86929b0..7410315d2 100644 --- a/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx +++ b/include/oneapi/mkl/blas/detail/rocblas/blas_ct.hxx @@ -20,119 +20,119 @@ **************************************************************************/ void herk(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, - int64_t k, float alpha, sycl::buffer, 1> &a, int64_t lda, float beta, - sycl::buffer, 1> &c, int64_t ldc) { + int64_t k, float alpha, sycl::buffer, 1>& a, int64_t lda, float beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void herk(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, - int64_t k, double alpha, sycl::buffer, 1> &a, int64_t lda, - double beta, sycl::buffer, 1> &c, int64_t ldc) { + int64_t k, double alpha, sycl::buffer, 1>& a, int64_t lda, + double beta, sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::herk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void scal(backend_selector selector, int64_t n, float alpha, - sycl::buffer &x, int64_t incx) { + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, int64_t n, double alpha, - sycl::buffer &x, int64_t incx) { + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> 
&x, int64_t incx) { + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx) { + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, int64_t n, float alpha, - sycl::buffer, 1> &x, int64_t incx) { + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void scal(backend_selector selector, int64_t n, double alpha, - sycl::buffer, 1> &x, int64_t incx) { + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trmv(backend_selector 
selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void spr(backend_selector selector, uplo upper_lower, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a) { + sycl::buffer& x, int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, 
alpha, x, incx, a); } void spr(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a) { + sycl::buffer& x, int64_t incx, sycl::buffer& a) { oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void gemm_batch(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, - int64_t stride_b, float beta, sycl::buffer &c, int64_t ldc, + int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -140,9 +140,9 @@ void gemm_batch(backend_selector selector, transpose transa, t } void gemm_batch(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, - int64_t stride_b, double beta, sycl::buffer &c, int64_t ldc, + int64_t m, int64_t n, int64_t k, double alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -151,9 +151,9 @@ void gemm_batch(backend_selector selector, transpose transa, t void gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - 
std::complex beta, sycl::buffer, 1> &c, int64_t ldc, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -162,9 +162,9 @@ void gemm_batch(backend_selector selector, transpose transa, t void gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -172,9 +172,9 @@ void gemm_batch(backend_selector selector, transpose transa, t } void gemm_batch(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, sycl::half alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, - int64_t stride_b, sycl::half beta, sycl::buffer &c, int64_t ldc, + int64_t m, int64_t n, int64_t k, sycl::half alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, sycl::half beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -182,9 +182,9 @@ void gemm_batch(backend_selector selector, transpose transa, t } void gemm_batch(backend_selector selector, 
transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, - int64_t stride_b, float beta, sycl::buffer &c, int64_t ldc, + int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -192,9 +192,9 @@ void gemm_batch(backend_selector selector, transpose transa, t } void gemm_batch(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, - int64_t stride_b, float beta, sycl::buffer &c, int64_t ldc, + int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -202,9 +202,9 @@ void gemm_batch(backend_selector selector, transpose transa, t } void gemm_batch(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, - int64_t stride_b, float beta, sycl::buffer &c, int64_t ldc, + int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, + int64_t stride_b, float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemm_batch(selector.get_queue(), transa, transb, m, n, 
k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, @@ -212,38 +212,38 @@ void gemm_batch(backend_selector selector, transpose transa, t } void syrk(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, float beta, - sycl::buffer &c, int64_t ldc) { + int64_t k, float alpha, sycl::buffer& a, int64_t lda, float beta, + sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, double beta, - sycl::buffer &c, int64_t ldc) { + int64_t k, double alpha, sycl::buffer& a, int64_t lda, double beta, + sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, std::complex beta, sycl::buffer, 1> &c, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, std::complex beta, sycl::buffer, 1> &c, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syrk(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, float 
beta, sycl::buffer &c, int64_t ldc, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -251,8 +251,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, t } void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, double alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, double beta, sycl::buffer &c, int64_t ldc, + int64_t n, int64_t k, double alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -261,8 +261,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, t void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -271,8 +271,8 @@ void syrk_batch(backend_selector selector, uplo upper_lower, t void syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, 
int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::syrk_batch(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, @@ -280,247 +280,247 @@ void syrk_batch(backend_selector selector, uplo upper_lower, t } void her2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void her2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void hbmv(backend_selector selector, uplo upper_lower, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void hbmv(backend_selector selector, uplo upper_lower, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 
1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void rot(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, float c, float s) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, float c, float s) { oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void rot(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, double c, double s) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, double c, double s) { oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, float c, float s) { +void rot(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, float c, float s) { oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } -void rot(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, double c, double s) { +void rot(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, double c, double s) { oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s); } void axpy(backend_selector selector, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, int64_t n, double alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { 
oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy); } void axpy_batch(backend_selector selector, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, int64_t stridex, sycl::buffer &y, + sycl::buffer& x, int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, int64_t n, double alpha, - sycl::buffer &x, int64_t incx, int64_t stridex, - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { + sycl::buffer& x, int64_t incx, int64_t stridex, + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpy_batch(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void 
axpy_batch(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } void axpby(backend_selector selector, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, float beta, sycl::buffer &y, + sycl::buffer& x, int64_t incx, float beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, int64_t n, double alpha, - sycl::buffer &x, int64_t incx, double beta, sycl::buffer &y, + sycl::buffer& x, int64_t incx, double beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void axpby(backend_selector selector, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy); } void sdsdot(backend_selector selector, int64_t n, float sb, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &result) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& result) 
{ oneapi::mkl::blas::rocblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result); } void gerc(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void gerc(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, + int64_t n, int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t 
k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { + int64_t n, int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2k(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemv(backend_selector selector, transpose trans, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, float beta, sycl::buffer &y, int64_t incy) { + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, float beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, double beta, sycl::buffer &y, int64_t incy) { + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, double beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t 
incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv(backend_selector selector, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } void gemv_batch(backend_selector selector, transpose trans, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &x, int64_t incx, int64_t stridex, float beta, - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { + float alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& x, int64_t incx, int64_t stridex, float beta, + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void gemv_batch(backend_selector selector, transpose trans, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &x, int64_t incx, int64_t stridex, double beta, - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { + double alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& x, int64_t incx, int64_t stridex, double beta, + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { 
oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } void gemv_batch(backend_selector selector, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stridea, sycl::buffer, 1> &x, int64_t incx, - int64_t stridex, std::complex beta, sycl::buffer, 1> &y, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stridea, sycl::buffer, 1>& x, int64_t incx, + int64_t stridex, std::complex beta, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, @@ -528,10 +528,10 @@ void gemv_batch(backend_selector selector, transpose trans, in } void gemv_batch(backend_selector selector, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stridea, sycl::buffer, 1> &x, int64_t incx, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stridea, sycl::buffer, 1>& x, int64_t incx, int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::gemv_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, @@ -539,8 +539,8 @@ void gemv_batch(backend_selector selector, transpose trans, in } void dgmm_batch(backend_selector selector, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stridea, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &c, int64_t ldc, + sycl::buffer& a, int64_t lda, int64_t stridea, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { 
oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -548,18 +548,18 @@ void dgmm_batch(backend_selector selector, side left_right, in } void dgmm_batch(backend_selector selector, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &x, int64_t incx, int64_t stridex, - sycl::buffer &c, int64_t ldc, int64_t stridec, int64_t batch_size) { + sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& x, int64_t incx, int64_t stridex, + sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } void dgmm_batch(backend_selector selector, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stridea, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &c, int64_t ldc, int64_t stridec, + sycl::buffer, 1>& a, int64_t lda, int64_t stridea, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& c, int64_t ldc, int64_t stridec, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, @@ -567,9 +567,9 @@ void dgmm_batch(backend_selector selector, side left_right, in } void dgmm_batch(backend_selector selector, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stridea, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &c, int64_t ldc, int64_t stridec, + sycl::buffer, 1>& a, int64_t lda, int64_t stridea, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& c, int64_t ldc, int64_t stridec, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch(selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, 
stridec, @@ -577,88 +577,88 @@ void dgmm_batch(backend_selector selector, side left_right, in } void her(backend_selector selector, uplo upper_lower, int64_t n, float alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a, int64_t lda) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void her(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a, int64_t lda) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void hpr(backend_selector selector, uplo upper_lower, int64_t n, float alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } void hpr(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a) { oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a); } -void iamin(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void iamin(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } -void iamin(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void iamin(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector 
selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void iamin(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result); } void hpmv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void hpmv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, int64_t incx, float beta, - sycl::buffer &y, int64_t incy) { + sycl::buffer& a, sycl::buffer& x, int64_t incx, float beta, + sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void spmv(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer &a, sycl::buffer &x, int64_t incx, double beta, - sycl::buffer &y, int64_t incy) { + sycl::buffer& a, 
sycl::buffer& x, int64_t incx, double beta, + sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy); } void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, int8_t ao, sycl::buffer &b, - int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, int64_t lda, int8_t ao, sycl::buffer& b, + int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, int64_t ldc, + sycl::buffer& co) { oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); @@ -666,9 +666,9 @@ void gemm_bias(backend_selector selector, transpose transa, tr void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, int8_t ao, sycl::buffer &b, - int64_t ldb, int8_t bo, float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, int64_t lda, int8_t ao, sycl::buffer& b, + int64_t ldb, int8_t bo, float beta, sycl::buffer& c, int64_t ldc, + sycl::buffer& co) { oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); @@ -676,9 +676,9 @@ void gemm_bias(backend_selector selector, transpose transa, tr void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, uint8_t ao, sycl::buffer &b, - int64_t ldb, int8_t bo, float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, int64_t lda, uint8_t ao, sycl::buffer& b, + int64_t ldb, int8_t bo, float beta, sycl::buffer& c, int64_t ldc, + sycl::buffer& co) { 
oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); @@ -686,83 +686,83 @@ void gemm_bias(backend_selector selector, transpose transa, tr void gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, uint8_t ao, sycl::buffer &b, - int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { + sycl::buffer& a, int64_t lda, uint8_t ao, sycl::buffer& b, + int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, int64_t ldc, + sycl::buffer& co) { oneapi::mkl::blas::rocblas::MAJOR::gemm_bias(selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void swap(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy) { +void swap(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } -void swap(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy) { +void swap(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void swap(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { 
oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy); } void geru(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void geru(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void nrm2(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, sycl::buffer &result) { + sycl::buffer, 1>& x, int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void nrm2(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, sycl::buffer &result) { + sycl::buffer, 1>& x, int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } -void nrm2(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void nrm2(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { 
oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result); } void gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, + int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, + int64_t m, int64_t n, int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -770,161 +770,161 @@ void gemm(backend_selector selector, transpose transa, transpo void gemm(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, 
std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, sycl::half alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, sycl::half beta, - sycl::buffer &c, int64_t ldc) { + int64_t m, int64_t n, int64_t k, sycl::half alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, sycl::half beta, + sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, + int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, + int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void syr2(backend_selector selector, uplo upper_lower, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { 
oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void syr2(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, int64_t m, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void ger(backend_selector selector, int64_t m, int64_t n, double alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, 
upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void dotu(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void dotu(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result); } void hemm(backend_selector selector, side left_right, uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { + int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, 
sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hemm(backend_selector selector, side left_right, uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { + int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void hpr2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void hpr2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a) { + std::complex alpha, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a) { oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx, float beta, sycl::buffer &y, + int64_t kl, int64_t ku, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx, float beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, 
y, incy); } void gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx, double beta, sycl::buffer &y, + int64_t kl, int64_t ku, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx, double beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); @@ -932,433 +932,433 @@ void gbmv(backend_selector selector, transpose trans, int64_t void gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void 
tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbmv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void symm(backend_selector selector, side left_right, uplo upper_lower, int64_t m, - int64_t n, float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc) { + int64_t n, float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, int64_t m, - int64_t n, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, + int64_t n, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t 
ldb, double beta, sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { + int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void symm(backend_selector selector, side left_right, uplo upper_lower, int64_t m, - int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { + int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } void dotc(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void dotc(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result); } void syr(backend_selector selector, uplo 
upper_lower, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void syr(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } void trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, 
int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } -void rotmg(backend_selector selector, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer ¶m) { +void rotmg(backend_selector selector, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, sycl::buffer &x, + diag unit_diag, int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void tpsv(backend_selector 
selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } void trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::trsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void copy(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy) { +void 
copy(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } -void copy(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy) { +void copy(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } void copy(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy); } -void copy_batch(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +void copy_batch(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +void copy_batch(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, int64_t 
n, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void copy_batch(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::copy_batch(selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size); } void hemv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void hemv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::hemv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, float beta, - sycl::buffer &c, int64_t ldc) { + 
transpose transb, int64_t n, int64_t k, float alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, float beta, + sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, int64_t n, int64_t k, double alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, double beta, - sycl::buffer &c, int64_t ldc) { + transpose transb, int64_t n, int64_t k, double alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, double beta, + sycl::buffer& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void asum(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, sycl::buffer 
&result) { + sycl::buffer, 1>& x, int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void asum(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, sycl::buffer &result) { + sycl::buffer, 1>& x, int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } -void asum(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void asum(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result); } void sbmv(backend_selector selector, uplo upper_lower, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, float beta, sycl::buffer &y, int64_t incy) { + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, float beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void sbmv(backend_selector selector, uplo upper_lower, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, double beta, sycl::buffer &y, int64_t incy) { + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, double beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t 
n, int64_t k, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &x, int64_t incx) { + diag unit_diag, int64_t n, int64_t k, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& x, int64_t incx) { oneapi::mkl::blas::rocblas::MAJOR::tbsv(selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } void spr2(backend_selector selector, uplo upper_lower, int64_t n, float alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } void spr2(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer &x, 
int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a); } -void iamax(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void iamax(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void iamax(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &result) { +void iamax(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } void iamax(backend_selector selector, int64_t n, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result); } -void rotm(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { +void rotm(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& param) { oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void rotm(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { +void rotm(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, 
sycl::buffer& param) { oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param); } -void dot(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void dot(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void dot(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } -void dot(backend_selector selector, int64_t n, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void dot(backend_selector selector, int64_t n, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& result) { oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result); } void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, @@ -1367,8 +1367,8 @@ void trsm_batch(backend_selector selector, side left_right, up void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, 
int64_t stride_b, int64_t batch_size) { + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1376,8 +1376,8 @@ void trsm_batch(backend_selector selector, side left_right, up void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, @@ -1386,8 +1386,8 @@ void trsm_batch(backend_selector selector, side left_right, up void trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::trsm_batch(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, @@ -1395,69 +1395,69 @@ void trsm_batch(backend_selector selector, side left_right, up } void her2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, float beta, - sycl::buffer, 1> &c, int64_t ldc) { + int64_t n, int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, 
int64_t ldb, float beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } void her2k(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, double beta, - sycl::buffer, 1> &c, int64_t ldc) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, double beta, + sycl::buffer, 1>& c, int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(backend_selector selector, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } -void rotg(backend_selector selector, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(backend_selector selector, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s); } void symv(backend_selector selector, uplo upper_lower, int64_t n, float 
alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void symv(backend_selector selector, uplo upper_lower, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { oneapi::mkl::blas::rocblas::MAJOR::symv(selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1465,8 +1465,8 @@ void omatcopy_batch(backend_selector selector, transpose trans void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, 
std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); @@ -1474,22 +1474,22 @@ void omatcopy_batch(backend_selector selector, transpose trans void omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch(selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1497,7 +1497,7 @@ void imatcopy_batch(backend_selector selector, transpose trans void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, 
std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); @@ -1505,16 +1505,16 @@ void imatcopy_batch(backend_selector selector, transpose trans void imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1522,9 +1522,9 @@ void omatadd_batch(backend_selector selector, transpose transa } void omatadd_batch(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + 
std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1533,9 +1533,9 @@ void omatadd_batch(backend_selector selector, transpose transa void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1544,10 +1544,10 @@ void omatadd_batch(backend_selector selector, transpose transa void omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch(selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, @@ -1555,123 +1555,123 @@ void omatadd_batch(backend_selector selector, transpose transa } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t 
ldb) { + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - 
std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { oneapi::mkl::blas::rocblas::MAJOR::omatcopy2(selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), 
trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - 
sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } void omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } @@ -1679,239 +1679,239 @@ void omatadd(backend_selector selector, transpose transa, tran // USM APIs sycl::event syr2(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const float *x, int64_t incx, const float *y, int64_t incy, float *a, - int64_t lda, const std::vector &dependencies) { + float alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* a, + int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2(backend_selector selector, uplo upper_lower, int64_t n, - double alpha, const double *x, int64_t incx, const double *y, int64_t incy, - double *a, int64_t lda, const std::vector &dependencies) { + double alpha, const double* x, int64_t incx, const double* y, int64_t incy, + double* a, int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } 
-sycl::event scal(backend_selector selector, int64_t n, float alpha, float *x, - int64_t incx, const std::vector &dependencies) { +sycl::event scal(backend_selector selector, int64_t n, float alpha, float* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } -sycl::event scal(backend_selector selector, int64_t n, double alpha, double *x, - int64_t incx, const std::vector &dependencies) { +sycl::event scal(backend_selector selector, int64_t n, double alpha, double* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, int64_t n, std::complex alpha, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, int64_t n, std::complex alpha, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, int64_t n, float alpha, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event scal(backend_selector selector, int64_t n, double alpha, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::rocblas::MAJOR::scal(selector.get_queue(), n, alpha, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trmv( selector.get_queue(), 
upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const float *a, float *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const float* a, float* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const double *a, double *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const double* a, double* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, std::complex *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, std::complex* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, std::complex *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, std::complex* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpmv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const 
float *x, int64_t incx, float *a, - const std::vector &dependencies) { + float alpha, const float* x, int64_t incx, float* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event spr(backend_selector selector, uplo upper_lower, int64_t n, - double alpha, const double *x, int64_t incx, double *a, - const std::vector &dependencies) { + double alpha, const double* x, int64_t incx, double* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::spr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event hpmv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hpmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, float 
beta, - float *c, int64_t ldc, const std::vector &dependencies) { + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, float beta, + float* c, int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, double beta, - double *c, int64_t ldc, const std::vector &dependencies) { + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, double beta, + double* c, int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event syrk(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, 
c, ldc, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, int64_t *n, int64_t *k, float *alpha, const float **a, - int64_t *lda, float *beta, float **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, int64_t* n, int64_t* k, float* alpha, const float** a, + int64_t* lda, float* beta, float** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, int64_t *n, int64_t *k, double *alpha, const double **a, - int64_t *lda, double *beta, double **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, int64_t* n, int64_t* k, double* alpha, const double** a, + int64_t* lda, double* beta, double** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, std::complex *beta, - std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, int64_t* n, int64_t* k, std::complex* alpha, + const 
std::complex** a, int64_t* lda, std::complex* beta, + std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event syrk_batch(backend_selector selector, uplo *upper_lower, - transpose *trans, int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, std::complex *beta, - std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event syrk_batch(backend_selector selector, uplo* upper_lower, + transpose* trans, int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex* beta, + std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); @@ -1919,10 +1919,10 @@ sycl::event syrk_batch(backend_selector selector, uplo *upper_ } sycl::event syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, int64_t n, int64_t k, float alpha, const float *a, - int64_t lda, int64_t stride_a, float beta, float *c, int64_t ldc, + transpose trans, int64_t n, int64_t k, float alpha, const float* a, + int64_t lda, int64_t stride_a, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -1930,10 +1930,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_l } sycl::event 
syrk_batch(backend_selector selector, uplo upper_lower, - transpose trans, int64_t n, int64_t k, double alpha, const double *a, - int64_t lda, int64_t stride_a, double beta, double *c, int64_t ldc, + transpose trans, int64_t n, int64_t k, double alpha, const double* a, + int64_t lda, int64_t stride_a, double beta, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -1942,10 +1942,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_l sycl::event syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex beta, std::complex *c, int64_t ldc, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -1954,10 +1954,10 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_l sycl::event syrk_batch(backend_selector selector, uplo upper_lower, transpose trans, int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex beta, std::complex *c, int64_t ldc, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syrk_batch( selector.get_queue(), upper_lower, trans, 
n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); @@ -1965,28 +1965,28 @@ sycl::event syrk_batch(backend_selector selector, uplo upper_l } sycl::event her2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event her2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::her2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event hbmv(backend_selector selector, uplo upper_lower, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -1994,120 +1994,120 @@ sycl::event hbmv(backend_selector selector, uplo upper_lower, } sycl::event 
hbmv(backend_selector selector, uplo upper_lower, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } -sycl::event rot(backend_selector selector, int64_t n, std::complex *x, - int64_t incx, std::complex *y, int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, int64_t n, std::complex* x, + int64_t incx, std::complex* y, int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, int64_t n, std::complex *x, - int64_t incx, std::complex *y, int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, int64_t n, std::complex* x, + int64_t incx, std::complex* y, int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } -sycl::event rot(backend_selector selector, int64_t n, float *x, int64_t incx, - float *y, int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, int64_t n, float* x, int64_t incx, + float* y, int64_t incy, float c, float s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, 
dependencies); return done; } -sycl::event rot(backend_selector selector, int64_t n, double *x, int64_t incx, - double *y, int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(backend_selector selector, int64_t n, double* x, int64_t incx, + double* y, int64_t incy, double c, double s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rot(selector.get_queue(), n, x, incx, y, incy, c, s, dependencies); return done; } sycl::event axpy(backend_selector selector, int64_t n, float alpha, - const float *x, int64_t incx, float *y, int64_t incy, - const std::vector &dependencies) { + const float* x, int64_t incx, float* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, int64_t n, double alpha, - const double *x, int64_t incx, double *y, int64_t incy, - const std::vector &dependencies) { + const double* x, int64_t incx, double* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } sycl::event axpy(backend_selector selector, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::rocblas::MAJOR::axpy(selector.get_queue(), n, alpha, x, incx, y, incy, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, int64_t *n, float *alpha, - const float **x, int64_t *incx, float **y, int64_t *incy, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, int64_t* n, float* alpha, + const float** x, int64_t* incx, float** y, int64_t* incy, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, int64_t *n, double *alpha, - const double **x, int64_t *incx, double **y, int64_t *incy, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, int64_t* n, double* alpha, + const double** x, int64_t* incx, double** y, int64_t* incy, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event axpy_batch(backend_selector selector, int64_t *n, - std::complex *alpha, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, int64_t* n, + std::complex* alpha, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event 
axpy_batch(backend_selector selector, int64_t *n, - std::complex *alpha, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(backend_selector selector, int64_t* n, + std::complex* alpha, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch( selector.get_queue(), n, alpha, x, incx, y, incy, group_count, group_size, dependencies); return done; } sycl::event axpy_batch(backend_selector selector, int64_t n, float alpha, - const float *x, int64_t incx, int64_t stridex, float *y, int64_t incy, + const float* x, int64_t incx, int64_t stridex, float* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2115,9 +2115,9 @@ sycl::event axpy_batch(backend_selector selector, int64_t n, f } sycl::event axpy_batch(backend_selector selector, int64_t n, double alpha, - const double *x, int64_t incx, int64_t stridex, double *y, int64_t incy, + const double* x, int64_t incx, int64_t stridex, double* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2125,9 +2125,9 @@ sycl::event axpy_batch(backend_selector selector, int64_t n, d } sycl::event axpy_batch(backend_selector selector, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - int64_t stridex, std::complex *y, int64_t incy, int64_t stridey, - int64_t batch_size, const 
std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + int64_t stridex, std::complex* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2135,9 +2135,9 @@ sycl::event axpy_batch(backend_selector selector, int64_t n, } sycl::event axpy_batch(backend_selector selector, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - int64_t stridex, std::complex *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + int64_t stridex, std::complex* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpy_batch(selector.get_queue(), n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); @@ -2145,61 +2145,61 @@ sycl::event axpy_batch(backend_selector selector, int64_t n, } sycl::event axpby(backend_selector selector, int64_t n, float alpha, - const float *x, int64_t incx, const float beta, float *y, int64_t incy, - const std::vector &dependencies) { + const float* x, int64_t incx, const float beta, float* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, int64_t n, double alpha, - const double *x, int64_t incx, const double beta, double *y, int64_t incy, - const std::vector &dependencies) { + const double* x, int64_t incx, const double beta, double* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event 
axpby(backend_selector selector, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, const std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event axpby(backend_selector selector, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::axpby(selector.get_queue(), n, alpha, x, incx, beta, y, incy, dependencies); return done; } sycl::event gerc(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event gerc(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gerc(selector.get_queue(), m, n, alpha, x, 
incx, y, incy, a, lda, dependencies); return done; } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2207,9 +2207,9 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2217,10 +2217,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, 
beta, c, ldc, dependencies); @@ -2228,10 +2228,10 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event syr2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2239,47 +2239,47 @@ sycl::event syr2k(backend_selector selector, uplo upper_lower, } sycl::event gemv(backend_selector selector, transpose trans, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta, - float *y, int64_t incy, const std::vector &dependencies) { + float alpha, const float* a, int64_t lda, const float* x, int64_t incx, float beta, + float* y, int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, int64_t lda, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, 
int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv(backend_selector selector, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv(selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemv_batch(backend_selector selector, transpose trans, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stridea, - const float *x, int64_t incx, int64_t stridex, float beta, float *y, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stridea, + const float* x, int64_t incx, int64_t stridex, float beta, float* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2287,10 +2287,10 @@ sycl::event gemv_batch(backend_selector selector, transpose tr } sycl::event gemv_batch(backend_selector selector, transpose trans, 
int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stridea, - const double *x, int64_t incx, int64_t stridex, double beta, double *y, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stridea, + const double* x, int64_t incx, int64_t stridex, double beta, double* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2298,11 +2298,11 @@ sycl::event gemv_batch(backend_selector selector, transpose tr } sycl::event gemv_batch(backend_selector selector, transpose trans, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stridea, const std::complex *x, int64_t incx, - int64_t stridex, std::complex beta, std::complex *y, + int64_t n, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stridea, const std::complex* x, int64_t incx, + int64_t stridex, std::complex beta, std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); @@ -2310,55 +2310,55 @@ sycl::event gemv_batch(backend_selector selector, transpose tr } sycl::event gemv_batch(backend_selector selector, transpose trans, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stridea, const std::complex *x, int64_t incx, - int64_t stridex, std::complex beta, std::complex *y, + int64_t n, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stridea, const std::complex* x, int64_t incx, + int64_t stridex, std::complex beta, std::complex* y, int64_t incy, 
int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, int64_t *m, - int64_t *n, float *alpha, const float **a, int64_t *lda, const float **x, - int64_t *incx, float *beta, float **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, int64_t* m, + int64_t* n, float* alpha, const float** a, int64_t* lda, const float** x, + int64_t* incx, float* beta, float** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, int64_t *m, - int64_t *n, double *alpha, const double **a, int64_t *lda, const double **x, - int64_t *incx, double *beta, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, int64_t* m, + int64_t* n, double* alpha, const double** a, int64_t* lda, const double** x, + int64_t* incx, double* beta, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, int64_t *m, - int64_t *n, std::complex *alpha, const 
std::complex **a, - int64_t *lda, const std::complex **x, int64_t *incx, - std::complex *beta, std::complex **y, int64_t *incy, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, int64_t* m, + int64_t* n, std::complex* alpha, const std::complex** a, + int64_t* lda, const std::complex** x, int64_t* incx, + std::complex* beta, std::complex** y, int64_t* incy, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); return done; } -sycl::event gemv_batch(backend_selector selector, transpose *trans, int64_t *m, - int64_t *n, std::complex *alpha, const std::complex **a, - int64_t *lda, const std::complex **x, int64_t *incx, - std::complex *beta, std::complex **y, int64_t *incy, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(backend_selector selector, transpose* trans, int64_t* m, + int64_t* n, std::complex* alpha, const std::complex** a, + int64_t* lda, const std::complex** x, int64_t* incx, + std::complex* beta, std::complex** y, int64_t* incy, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemv_batch( selector.get_queue(), trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); @@ -2366,9 +2366,9 @@ sycl::event gemv_batch(backend_selector selector, transpose *t } sycl::event dgmm_batch(backend_selector selector, side left_right, int64_t m, - int64_t n, const float *a, int64_t lda, int64_t stridea, const float *x, - int64_t incx, int64_t stridex, float *c, int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { + int64_t n, const float* a, int64_t lda, int64_t stridea, const 
float* x, + int64_t incx, int64_t stridex, float* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); @@ -2376,9 +2376,9 @@ sycl::event dgmm_batch(backend_selector selector, side left_ri } sycl::event dgmm_batch(backend_selector selector, side left_right, int64_t m, - int64_t n, const double *a, int64_t lda, int64_t stridea, const double *x, - int64_t incx, int64_t stridex, double *c, int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { + int64_t n, const double* a, int64_t lda, int64_t stridea, const double* x, + int64_t incx, int64_t stridex, double* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); @@ -2386,10 +2386,10 @@ sycl::event dgmm_batch(backend_selector selector, side left_ri } sycl::event dgmm_batch(backend_selector selector, side left_right, int64_t m, - int64_t n, const std::complex *a, int64_t lda, int64_t stridea, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies) { + int64_t n, const std::complex* a, int64_t lda, int64_t stridea, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* c, int64_t ldc, int64_t stridec, int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); @@ -2397,52 +2397,52 @@ sycl::event dgmm_batch(backend_selector selector, side left_ri } sycl::event 
dgmm_batch(backend_selector selector, side left_right, int64_t m, - int64_t n, const std::complex *a, int64_t lda, int64_t stridea, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies) { + int64_t n, const std::complex* a, int64_t lda, int64_t stridea, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* c, int64_t ldc, int64_t stridec, int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, int64_t *m, - int64_t *n, const float **a, int64_t *lda, const float **x, int64_t *incx, - float **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, int64_t* m, + int64_t* n, const float** a, int64_t* lda, const float** x, int64_t* incx, + float** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, int64_t *m, - int64_t *n, const double **a, int64_t *lda, const double **x, int64_t *incx, - double **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, int64_t* m, + int64_t* n, const double** a, int64_t* lda, const double** x, int64_t* incx, + double** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, int64_t *m, - int64_t *n, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, int64_t* m, + int64_t* n, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event dgmm_batch(backend_selector selector, side *left_right, int64_t *m, - int64_t *n, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(backend_selector selector, side* left_right, int64_t* m, + int64_t* n, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dgmm_batch( selector.get_queue(), left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); @@ -2450,151 +2450,151 @@ sycl::event dgmm_batch(backend_selector selector, side *left_r } sycl::event her(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const std::complex *x, int64_t incx, std::complex *a, - int64_t lda, const std::vector &dependencies) { + float alpha, const std::complex* x, int64_t incx, 
std::complex* a, + int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event her(backend_selector selector, uplo upper_lower, int64_t n, - double alpha, const std::complex *x, int64_t incx, std::complex *a, - int64_t lda, const std::vector &dependencies) { + double alpha, const std::complex* x, int64_t incx, std::complex* a, + int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::her(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const std::complex *x, int64_t incx, std::complex *a, - const std::vector &dependencies) { + float alpha, const std::complex* x, int64_t incx, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } sycl::event hpr(backend_selector selector, uplo upper_lower, int64_t n, - double alpha, const std::complex *x, int64_t incx, std::complex *a, - const std::vector &dependencies) { + double alpha, const std::complex* x, int64_t incx, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, dependencies); return done; } -sycl::event iamin(backend_selector selector, int64_t n, const float *x, - int64_t incx, int64_t *result, const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, int64_t n, const float* x, + int64_t incx, int64_t* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamin(backend_selector selector, int64_t n, const double *x, - 
int64_t incx, int64_t *result, const std::vector &dependencies) { +sycl::event iamin(backend_selector selector, int64_t n, const double* x, + int64_t incx, int64_t* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamin(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamin(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha, - const float **a, int64_t *lda, const float **b, int64_t *ldb, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, float* alpha, + const float** a, int64_t* lda, const float** b, int64_t* ldb, float* beta, + float** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose 
*transb, int64_t *m, int64_t *n, int64_t *k, double *alpha, - const double **a, int64_t *lda, const double **b, int64_t *ldb, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, double* alpha, + const double** a, int64_t* lda, const double** b, int64_t* ldb, double* beta, + double** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, int64_t *m, int64_t *n, int64_t *k, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **b, int64_t *ldb, std::complex *beta, - std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** b, int64_t* ldb, std::complex* beta, + std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, int64_t *m, int64_t *n, int64_t *k, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **b, int64_t *ldb, std::complex *beta, - std::complex **c, int64_t *ldc, int64_t group_count, - 
int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** b, int64_t* ldb, std::complex* beta, + std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, int64_t *m, int64_t *n, int64_t *k, sycl::half *alpha, - const sycl::half **a, int64_t *lda, const sycl::half **b, int64_t *ldb, - sycl::half *beta, sycl::half **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, sycl::half* alpha, + const sycl::half** a, int64_t* lda, const sycl::half** b, int64_t* ldb, + sycl::half* beta, sycl::half** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha, - const sycl::half **a, int64_t *lda, const sycl::half **b, int64_t *ldb, - float *beta, float **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, float* alpha, + const 
sycl::half** a, int64_t* lda, const sycl::half** b, int64_t* ldb, + float* beta, float** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha, - const std::int8_t **a, int64_t *lda, const std::int8_t **b, int64_t *ldb, - float *beta, float **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, float* alpha, + const std::int8_t** a, int64_t* lda, const std::int8_t** b, int64_t* ldb, + float* beta, float** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); return done; } -sycl::event gemm_batch(backend_selector selector, transpose *transa, - transpose *transb, int64_t *m, int64_t *n, int64_t *k, float *alpha, - const std::int8_t **a, int64_t *lda, const std::int8_t **b, int64_t *ldb, - float *beta, std::int32_t **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(backend_selector selector, transpose* transa, + transpose* transb, int64_t* m, int64_t* n, int64_t* k, float* alpha, + const std::int8_t** a, int64_t* lda, const std::int8_t** b, int64_t* ldb, + float* beta, std::int32_t** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( 
selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); @@ -2603,9 +2603,9 @@ sycl::event gemm_batch(backend_selector selector, transpose *t sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, float alpha, - const float *a, int64_t lda, int64_t stride_a, const float *b, int64_t ldb, - int64_t stride_b, float beta, float *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { + const float* a, int64_t lda, int64_t stride_a, const float* b, int64_t ldb, + int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2614,9 +2614,9 @@ sycl::event gemm_batch(backend_selector selector, transpose tr sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, double alpha, - const double *a, int64_t lda, int64_t stride_a, const double *b, int64_t ldb, - int64_t stride_b, double beta, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { + const double* a, int64_t lda, int64_t stride_a, const double* b, int64_t ldb, + int64_t stride_b, double beta, double* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2625,11 +2625,11 @@ sycl::event gemm_batch(backend_selector selector, transpose tr sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, 
int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex beta, std::complex *c, int64_t ldc, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2638,11 +2638,11 @@ sycl::event gemm_batch(backend_selector selector, transpose tr sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *b, int64_t ldb, - int64_t stride_b, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2651,10 +2651,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tr sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, sycl::half alpha, - const sycl::half *a, int64_t lda, int64_t stride_a, const sycl::half *b, - int64_t ldb, int64_t stride_b, sycl::half beta, sycl::half *c, int64_t ldc, + const sycl::half* a, int64_t lda, int64_t stride_a, const sycl::half* b, + 
int64_t ldb, int64_t stride_b, sycl::half beta, sycl::half* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2663,10 +2663,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tr sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, float alpha, - const sycl::half *a, int64_t lda, int64_t stride_a, const sycl::half *b, - int64_t ldb, int64_t stride_b, float beta, float *c, int64_t ldc, + const sycl::half* a, int64_t lda, int64_t stride_a, const sycl::half* b, + int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2675,10 +2675,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tr sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, float alpha, - const std::int8_t *a, int64_t lda, int64_t stride_a, const std::int8_t *b, - int64_t ldb, int64_t stride_b, float beta, float *c, int64_t ldc, + const std::int8_t* a, int64_t lda, int64_t stride_a, const std::int8_t* b, + int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, 
batch_size, dependencies); @@ -2687,10 +2687,10 @@ sycl::event gemm_batch(backend_selector selector, transpose tr sycl::event gemm_batch(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, float alpha, - const std::int8_t *a, int64_t lda, int64_t stride_a, const std::int8_t *b, - int64_t ldb, int64_t stride_b, float beta, std::int32_t *c, int64_t ldc, + const std::int8_t* a, int64_t lda, int64_t stride_a, const std::int8_t* b, + int64_t ldb, int64_t stride_b, float beta, std::int32_t* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_batch( selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); @@ -2698,103 +2698,103 @@ sycl::event gemm_batch(backend_selector selector, transpose tr } sycl::event spmv(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const float *a, const float *x, int64_t incx, float beta, float *y, - int64_t incy, const std::vector &dependencies) { + float alpha, const float* a, const float* x, int64_t incx, float beta, float* y, + int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } sycl::event spmv(backend_selector selector, uplo upper_lower, int64_t n, - double alpha, const double *a, const double *x, int64_t incx, double beta, - double *y, int64_t incy, const std::vector &dependencies) { + double alpha, const double* a, const double* x, int64_t incx, double beta, + double* y, int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::spmv(selector.get_queue(), upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, 
int64_t n, float *x, int64_t incx, - float *y, int64_t incy, const std::vector &dependencies) { +sycl::event swap(backend_selector selector, int64_t n, float* x, int64_t incx, + float* y, int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, int64_t n, double *x, int64_t incx, - double *y, int64_t incy, const std::vector &dependencies) { +sycl::event swap(backend_selector selector, int64_t n, double* x, int64_t incx, + double* y, int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, int64_t n, std::complex *x, - int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, int64_t n, std::complex* x, + int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event swap(backend_selector selector, int64_t n, std::complex *x, - int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event swap(backend_selector selector, int64_t n, std::complex* x, + int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::swap(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event geru(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, 
int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event geru(backend_selector selector, int64_t m, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::geru(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event nrm2(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event nrm2(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, int64_t n, const float *x, - int64_t incx, float *result, const std::vector &dependencies) { +sycl::event nrm2(backend_selector selector, int64_t n, const float* x, + int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event nrm2(backend_selector selector, int64_t n, const double *x, - int64_t incx, double *result, const std::vector &dependencies) { 
+sycl::event nrm2(backend_selector selector, int64_t n, const double* x, + int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::nrm2(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, - const float *b, int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, int64_t k, float alpha, const float* a, int64_t lda, + const float* b, int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2802,9 +2802,9 @@ sycl::event gemm(backend_selector selector, transpose transa, } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, double alpha, const double *a, int64_t lda, - const double *b, int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, int64_t k, double alpha, const double* a, int64_t lda, + const double* b, int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2813,9 +2813,9 @@ sycl::event gemm(backend_selector selector, transpose transa, sycl::event gemm(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *b, - int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, const std::complex* b, + int64_t ldb, 
std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2824,9 +2824,9 @@ sycl::event gemm(backend_selector selector, transpose transa, sycl::event gemm(backend_selector selector, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *b, - int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, const std::complex* b, + int64_t ldb, std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2834,9 +2834,9 @@ sycl::event gemm(backend_selector selector, transpose transa, } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, sycl::half alpha, const sycl::half *a, - int64_t lda, const sycl::half *b, int64_t ldb, sycl::half beta, sycl::half *c, - int64_t ldc, const std::vector &dependencies) { + int64_t m, int64_t n, int64_t k, sycl::half alpha, const sycl::half* a, + int64_t lda, const sycl::half* b, int64_t ldb, sycl::half beta, sycl::half* c, + int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2844,9 +2844,9 @@ sycl::event gemm(backend_selector selector, transpose transa, } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, const sycl::half *a, int64_t lda, - const sycl::half *b, int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector 
&dependencies) { + int64_t m, int64_t n, int64_t k, float alpha, const sycl::half* a, int64_t lda, + const sycl::half* b, int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2854,9 +2854,9 @@ sycl::event gemm(backend_selector selector, transpose transa, } sycl::event gemm(backend_selector selector, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, float alpha, const bfloat16 *a, int64_t lda, - const bfloat16 *b, int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, int64_t k, float alpha, const bfloat16* a, int64_t lda, + const bfloat16* b, int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm(selector.get_queue(), transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -2865,10 +2865,10 @@ sycl::event gemm(backend_selector selector, transpose transa, sycl::event gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, - float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao, - const std::uint8_t *b, int64_t ldb, std::uint8_t bo, float beta, - std::int32_t *c, int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, int64_t lda, std::int8_t ao, + const std::uint8_t* b, int64_t ldb, std::uint8_t bo, float beta, + std::int32_t* c, int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -2877,10 +2877,10 @@ sycl::event gemm_bias(backend_selector selector, transpose tra sycl::event 
gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, - float alpha, const std::int8_t *a, int64_t lda, std::int8_t ao, - const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, - std::int32_t *c, int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, int64_t lda, std::int8_t ao, + const std::int8_t* b, int64_t ldb, std::int8_t bo, float beta, + std::int32_t* c, int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -2889,10 +2889,10 @@ sycl::event gemm_bias(backend_selector selector, transpose tra sycl::event gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, - float alpha, const std::uint8_t *a, int64_t lda, std::uint8_t ao, - const std::int8_t *b, int64_t ldb, std::int8_t bo, float beta, - std::int32_t *c, int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + float alpha, const std::uint8_t* a, int64_t lda, std::uint8_t ao, + const std::int8_t* b, int64_t ldb, std::int8_t bo, float beta, + std::int32_t* c, int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -2901,10 +2901,10 @@ sycl::event gemm_bias(backend_selector selector, transpose tra sycl::event gemm_bias(backend_selector selector, transpose transa, transpose transb, offset offsetc, int64_t m, int64_t n, int64_t k, - float alpha, const std::uint8_t *a, int64_t lda, std::uint8_t ao, - const std::uint8_t *b, int64_t ldb, std::uint8_t bo, float beta, - std::int32_t *c, int64_t ldc, const 
std::int32_t *co, - const std::vector &dependencies) { + float alpha, const std::uint8_t* a, int64_t lda, std::uint8_t ao, + const std::uint8_t* b, int64_t ldb, std::uint8_t bo, float beta, + std::int32_t* c, int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemm_bias( selector.get_queue(), transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); @@ -2912,42 +2912,42 @@ sycl::event gemm_bias(backend_selector selector, transpose tra } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, float alpha, const std::complex *a, int64_t lda, - float beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, float alpha, const std::complex* a, int64_t lda, + float beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event herk(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, double alpha, const std::complex *a, int64_t lda, - double beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, double alpha, const std::complex* a, int64_t lda, + double beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::herk( selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); return done; } sycl::event ger(backend_selector selector, int64_t m, int64_t n, float alpha, - const float *x, int64_t incx, const float *y, int64_t incy, float *a, int64_t lda, - const std::vector &dependencies) { + const float* x, int64_t incx, const float* y, int64_t incy, float* a, int64_t lda, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event ger(backend_selector selector, int64_t m, int64_t n, double alpha, - const double *x, int64_t incx, const double *y, int64_t incy, double *a, - int64_t lda, const std::vector &dependencies) { + const double* x, int64_t incx, const double* y, int64_t incy, double* a, + int64_t lda, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::ger(selector.get_queue(), m, n, alpha, x, incx, y, incy, a, lda, dependencies); return done; } sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, float *b, int64_t ldb, const std::vector &dependencies) { + transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, float* b, int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -2956,8 +2956,8 @@ sycl::event trsm(backend_selector selector, side left_right, u sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, double *b, int64_t ldb, - const std::vector &dependencies) { + const double* a, int64_t lda, double* b, int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -2966,8 +2966,8 @@ sycl::event trsm(backend_selector selector, side left_right, u sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex 
*a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -2976,8 +2976,8 @@ sycl::event trsm(backend_selector selector, side left_right, u sycl::event trsm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -2986,9 +2986,9 @@ sycl::event trsm(backend_selector selector, side left_right, u sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, int64_t stride_a, float *b, + float alpha, const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -2997,9 +2997,9 @@ sycl::event trsm_batch(backend_selector selector, side left_ri sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, int64_t stride_a, double *b, + double alpha, const double* a, int64_t lda, 
int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3008,9 +3008,9 @@ sycl::event trsm_batch(backend_selector selector, side left_ri sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3019,53 +3019,53 @@ sycl::event trsm_batch(backend_selector selector, side left_ri sycl::event trsm_batch(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector 
selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, - float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, + float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, - double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, + double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* 
upper_lower, transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + std::complex** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); return done; } -sycl::event trsm_batch(backend_selector selector, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event trsm_batch(backend_selector selector, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + std::complex** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsm_batch( selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); @@ -3073,28 +3073,28 @@ sycl::event trsm_batch(backend_selector selector, side *left_r } sycl::event dotu(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotu(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *result, 
- const std::vector &dependencies) { + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dotu(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - int64_t m, int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3102,10 +3102,10 @@ sycl::event hemm(backend_selector selector, side left_right, u } sycl::event hemm(backend_selector selector, side left_right, uplo upper_lower, - int64_t m, int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hemm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3113,27 +3113,27 @@ sycl::event hemm(backend_selector selector, side left_right, u } sycl::event hpr2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, - const std::vector 
&dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event hpr2(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *a, - const std::vector &dependencies) { + std::complex alpha, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hpr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, float alpha, const float *a, int64_t lda, const float *x, - int64_t incx, float beta, float *y, int64_t incy, - const std::vector &dependencies) { + int64_t kl, int64_t ku, float alpha, const float* a, int64_t lda, const float* x, + int64_t incx, float beta, float* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3141,9 +3141,9 @@ sycl::event gbmv(backend_selector selector, transpose trans, i } sycl::event gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, double alpha, const double *a, int64_t lda, - const double *x, int64_t incx, double beta, double *y, int64_t incy, - const std::vector &dependencies) { + int64_t kl, int64_t ku, double alpha, const double* a, int64_t lda, + const double* x, int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3151,10 +3151,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, i } sycl::event gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + int64_t kl, int64_t ku, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3162,10 +3162,10 @@ sycl::event gbmv(backend_selector selector, transpose trans, i } sycl::event gbmv(backend_selector selector, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *x, int64_t incx, - std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies) { + int64_t kl, int64_t ku, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* x, int64_t incx, + std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gbmv(selector.get_queue(), trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3173,43 +3173,43 @@ sycl::event gbmv(backend_selector selector, transpose trans, i } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const float *a, int64_t lda, float *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const float* a, int64_t lda, float* x, + 
int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const double *a, int64_t lda, double *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const double* a, int64_t lda, double* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbmv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbmv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - int64_t m, int64_t n, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const 
std::vector &dependencies) { + int64_t m, int64_t n, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3217,9 +3217,9 @@ sycl::event symm(backend_selector selector, side left_right, u } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - int64_t m, int64_t n, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3227,10 +3227,10 @@ sycl::event symm(backend_selector selector, side left_right, u } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - int64_t m, int64_t n, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3238,10 +3238,10 @@ sycl::event symm(backend_selector selector, side left_right, u } sycl::event symm(backend_selector selector, side left_right, uplo upper_lower, - int64_t m, int64_t n, std::complex alpha, const std::complex *a, - 
int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t m, int64_t n, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::symm(selector.get_queue(), left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3249,42 +3249,42 @@ sycl::event symm(backend_selector selector, side left_right, u } sycl::event dotc(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event dotc(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dotc(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const float *x, int64_t incx, float *a, int64_t lda, - const std::vector &dependencies) { + float alpha, const float* x, int64_t incx, float* a, int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event syr(backend_selector selector, uplo upper_lower, 
int64_t n, - double alpha, const double *x, int64_t incx, double *a, int64_t lda, - const std::vector &dependencies) { + double alpha, const double* x, int64_t incx, double* a, int64_t lda, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::syr(selector.get_queue(), upper_lower, n, alpha, x, incx, a, lda, dependencies); return done; } sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, float *b, int64_t ldb, const std::vector &dependencies) { + transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, float* b, int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3293,8 +3293,8 @@ sycl::event trmm(backend_selector selector, side left_right, u sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, double *b, int64_t ldb, - const std::vector &dependencies) { + const double* a, int64_t lda, double* b, int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3303,8 +3303,8 @@ sycl::event trmm(backend_selector selector, side left_right, u sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); @@ -3313,218 +3313,218 @@ sycl::event trmm(backend_selector selector, side left_right, u sycl::event trmm(backend_selector selector, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trmm(selector.get_queue(), left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); return done; } -sycl::event rotmg(backend_selector selector, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } -sycl::event rotmg(backend_selector selector, double *d1, double *d2, double *x1, - double y1, double *param, const std::vector &dependencies) { +sycl::event rotmg(backend_selector selector, double* d1, double* d2, double* x1, + double y1, double* param, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotmg(selector.get_queue(), d1, d2, x1, y1, param, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const float *a, float *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const float* a, float* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, 
n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const double *a, double *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const double* a, double* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, std::complex *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, std::complex* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event tpsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, std::complex *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, std::complex* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tpsv(selector.get_queue(), upper_lower, trans, unit_diag, n, a, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, 
int64_t n, const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } sycl::event trsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::trsv( selector.get_queue(), upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); return done; } -sycl::event copy(backend_selector selector, int64_t n, const float *x, - int64_t incx, float *y, int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector selector, int64_t n, const float* x, + int64_t incx, float* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy(backend_selector selector, int64_t n, const double *x, - int64_t incx, double *y, int64_t incy, - const std::vector &dependencies) { +sycl::event copy(backend_selector 
selector, int64_t n, const double* x, + int64_t incx, double* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } sycl::event copy(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy(selector.get_queue(), n, x, incx, y, incy, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, int64_t *n, const float **x, - int64_t *incx, float **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, int64_t* n, const float** x, + int64_t* incx, float** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, int64_t *n, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, int64_t* n, const double** x, + int64_t* incx, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { 
auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, int64_t *n, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, int64_t* n, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, int64_t *n, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, int64_t* n, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, y, incy, group_count, group_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, int64_t n, const float *x, - int64_t incx, int64_t stridex, float *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, int64_t n, const float* x, + int64_t incx, int64_t stridex, float* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } -sycl::event copy_batch(backend_selector selector, int64_t n, const double 
*x, - int64_t incx, int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(backend_selector selector, int64_t n, const double* x, + int64_t incx, int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event copy_batch(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::copy_batch( selector.get_queue(), n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* 
y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event hemv(backend_selector selector, uplo upper_lower, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::hemv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, int64_t n, int64_t k, float alpha, const float *a, int64_t lda, - const float *b, int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { + transpose transb, int64_t n, int64_t k, float alpha, const float* a, int64_t lda, + const float* b, int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3532,9 +3532,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, } sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, - transpose transb, int64_t n, int64_t k, double alpha, const double *a, - int64_t lda, const double *b, int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { + transpose transb, int64_t n, int64_t k, double alpha, const double* a, + int64_t lda, const double* b, int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { auto done = 
oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3543,9 +3543,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *b, - int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, const std::complex* b, + int64_t ldb, std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3554,9 +3554,9 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, sycl::event gemmt(backend_selector selector, uplo upper_lower, transpose transa, transpose transb, int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *b, - int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, const std::complex* b, + int64_t ldb, std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::gemmt(selector.get_queue(), upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3564,8 +3564,8 @@ sycl::event gemmt(backend_selector selector, uplo upper_lower, } sycl::event sbmv(backend_selector selector, uplo upper_lower, int64_t n, - int64_t k, float alpha, const float *a, int64_t lda, const float *x, int64_t incx, - float beta, float *y, int64_t incy, const std::vector &dependencies) { + int64_t k, float alpha, const float* a, int64_t lda, const float* x, int64_t incx, + float beta, float* 
y, int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3573,9 +3573,9 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, } sycl::event sbmv(backend_selector selector, uplo upper_lower, int64_t n, - int64_t k, double alpha, const double *a, int64_t lda, const double *x, - int64_t incx, double beta, double *y, int64_t incy, - const std::vector &dependencies) { + int64_t k, double alpha, const double* a, int64_t lda, const double* x, + int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::sbmv(selector.get_queue(), upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); @@ -3583,174 +3583,174 @@ sycl::event sbmv(backend_selector selector, uplo upper_lower, } sycl::event asum(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, float *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event asum(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, double *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, int64_t n, const float *x, - int64_t incx, float *result, const std::vector &dependencies) { +sycl::event asum(backend_selector selector, int64_t n, const float* x, + int64_t incx, float* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), 
n, x, incx, result, dependencies); return done; } -sycl::event asum(backend_selector selector, int64_t n, const double *x, - int64_t incx, double *result, const std::vector &dependencies) { +sycl::event asum(backend_selector selector, int64_t n, const double* x, + int64_t incx, double* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::asum(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const float *a, int64_t lda, float *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const float* a, int64_t lda, float* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const double *a, int64_t lda, double *x, - int64_t incx, const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const double* a, int64_t lda, double* x, + int64_t incx, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event 
tbsv(backend_selector selector, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { + diag unit_diag, int64_t n, int64_t k, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::tbsv( selector.get_queue(), upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const float *x, int64_t incx, const float *y, int64_t incy, float *a, - const std::vector &dependencies) { + float alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* a, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } sycl::event spr2(backend_selector selector, uplo upper_lower, int64_t n, - double alpha, const double *x, int64_t incx, const double *y, int64_t incy, - double *a, const std::vector &dependencies) { + double alpha, const double* x, int64_t incx, const double* y, int64_t incy, + double* a, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::spr2(selector.get_queue(), upper_lower, n, alpha, x, incx, y, incy, a, dependencies); return done; } -sycl::event iamax(backend_selector selector, int64_t n, const float *x, - int64_t incx, int64_t *result, const std::vector &dependencies) { +sycl::event iamax(backend_selector selector, int64_t n, const float* x, + int64_t incx, int64_t* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event iamax(backend_selector selector, int64_t n, const double *x, - int64_t incx, int64_t *result, const std::vector 
&dependencies) { +sycl::event iamax(backend_selector selector, int64_t n, const double* x, + int64_t incx, int64_t* result, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } sycl::event iamax(backend_selector selector, int64_t n, - const std::complex *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { + const std::complex* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::iamax(selector.get_queue(), n, x, incx, result, dependencies); return done; } -sycl::event rotm(backend_selector selector, int64_t n, float *x, int64_t incx, - float *y, int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, int64_t n, float* x, int64_t incx, + float* y, int64_t incy, float* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotm(backend_selector selector, int64_t n, double *x, int64_t incx, - double *y, int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(backend_selector selector, int64_t n, double* x, int64_t incx, + double* y, int64_t incy, double* param, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotm(selector.get_queue(), n, x, incx, y, incy, param, dependencies); return done; } -sycl::event rotg(backend_selector selector, float *a, float *b, float *c, - float 
*s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, float* a, float* b, float* c, + float* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, double *a, double *b, double *c, - double *s, const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event rotg(backend_selector selector, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(backend_selector selector, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::rotg(selector.get_queue(), a, b, c, s, dependencies); return done; } -sycl::event sdsdot(backend_selector selector, int64_t n, float sb, const float *x, - int64_t incx, const float *y, int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event sdsdot(backend_selector selector, int64_t n, float sb, const float* x, + int64_t incx, const float* y, int64_t incy, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::sdsdot(selector.get_queue(), n, sb, x, incx, y, incy, result, dependencies); return done; } sycl::event her2k(backend_selector 
selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, float beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, float beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); @@ -3758,62 +3758,62 @@ sycl::event her2k(backend_selector selector, uplo upper_lower, } sycl::event her2k(backend_selector selector, uplo upper_lower, transpose trans, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, double beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, double beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::her2k(selector.get_queue(), upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); return done; } -sycl::event dot(backend_selector selector, int64_t n, const float *x, - int64_t incx, const float *y, int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, int64_t n, const float* x, + int64_t incx, const float* y, int64_t incy, float* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, int64_t n, const double *x, - int64_t incx, const double *y, int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event 
dot(backend_selector selector, int64_t n, const double* x, + int64_t incx, const double* y, int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } -sycl::event dot(backend_selector selector, int64_t n, const float *x, - int64_t incx, const float *y, int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(backend_selector selector, int64_t n, const float* x, + int64_t incx, const float* y, int64_t incy, double* result, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::dot(selector.get_queue(), n, x, incx, y, incy, result, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, int64_t n, - float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta, - float *y, int64_t incy, const std::vector &dependencies) { + float alpha, const float* a, int64_t lda, const float* x, int64_t incx, float beta, + float* y, int64_t incy, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event symv(backend_selector selector, uplo upper_lower, int64_t n, - double alpha, const double *a, int64_t lda, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { + double alpha, const double* a, int64_t lda, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::symv( selector.get_queue(), upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); return done; } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t 
stride_a, float *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3821,10 +3821,10 @@ sycl::event omatcopy_batch(backend_selector selector, transpos } sycl::event omatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3833,9 +3833,9 @@ sycl::event omatcopy_batch(backend_selector selector, transpos sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3844,9 +3844,9 @@ sycl::event 
omatcopy_batch(backend_selector selector, transpos sycl::event omatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); @@ -3854,18 +3854,18 @@ sycl::event omatcopy_batch(backend_selector selector, transpos } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; } sycl::event imatcopy_batch(backend_selector selector, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -3873,9 +3873,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpos sycl::event 
imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -3883,9 +3883,9 @@ sycl::event imatcopy_batch(backend_selector selector, transpos sycl::event imatcopy_batch(backend_selector selector, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch( selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); return done; @@ -3893,10 +3893,10 @@ sycl::event imatcopy_batch(backend_selector selector, transpos sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -3905,10 +3905,10 @@ sycl::event 
omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -3917,11 +3917,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -3930,11 +3930,11 @@ sycl::event omatadd_batch(backend_selector selector, transpose sycl::event omatadd_batch(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, 
const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd_batch( selector.get_queue(), transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); @@ -3942,115 +3942,115 @@ sycl::event omatadd_batch(backend_selector selector, transpose } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector 
&dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy(selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, 
b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event omatcopy2(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy2( selector.get_queue(), trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& 
dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event imatcopy(backend_selector selector, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, dependencies); return done; } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - float beta, const float *b, std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float beta, const float* b, std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4058,9 +4058,9 @@ sycl::event omatadd(backend_selector selector, transpose trans } sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - double beta, const double *b, std::int64_t ldb, double *c, std::int64_t 
ldc, - const std::vector &dependencies) { + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + double beta, const double* b, std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4069,9 +4069,9 @@ sycl::event omatadd(backend_selector selector, transpose trans sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4080,9 +4080,9 @@ sycl::event omatadd(backend_selector selector, transpose trans sycl::event omatadd(backend_selector selector, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatadd(selector.get_queue(), transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); @@ -4115,8 +4115,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpos std::int64_t* m, std::int64_t* n, std::complex* alpha, const 
std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4127,8 +4126,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpos std::int64_t* m, std::int64_t* n, std::complex* alpha, const std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::omatcopy_batch( selector.get_queue(), trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); @@ -4138,8 +4136,7 @@ sycl::event omatcopy_batch(backend_selector selector, transpos sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, float* alpha, float** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); @@ -4149,8 +4146,7 @@ sycl::event imatcopy_batch(backend_selector selector, transpos sycl::event imatcopy_batch(backend_selector selector, transpose* trans, std::int64_t* m, std::int64_t* n, double* alpha, double** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t* groupsize, - const std::vector& dependencies) { + std::int64_t* groupsize, const std::vector& dependencies) { auto done = oneapi::mkl::blas::rocblas::MAJOR::imatcopy_batch(selector.get_queue(), trans, m, n, alpha, ab, 
lda, ldb, group_count, groupsize, dependencies); diff --git a/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx b/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx index 70aabaaf9..f6c3eeee5 100644 --- a/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx +++ b/include/oneapi/mkl/blas/detail/rocblas/onemkl_blas_rocblas.hxx @@ -21,2102 +21,2102 @@ // Buffer APIs -void asum(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void asum(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void asum(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void asum(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result); -void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result); -void axpy(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy); +void axpy(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy); -void axpy(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy); +void axpy(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy); -void axpy(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy); +void axpy(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy); -void axpy(sycl::queue &queue, int64_t n, 
std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy); +void axpy(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy); -void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size); -void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +void axpy_batch(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size); -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size); -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size); -void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy); +void axpby(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy); -void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - 
double beta, sycl::buffer &y, int64_t incy); +void axpby(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy); -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); -void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy); +void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy); -void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy); +void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy); -void copy(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy); +void copy(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy); -void copy(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy); +void copy(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy); -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t 
batch_size); -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size); -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size); -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size); -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result); -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result); -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result); -void dotc(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result); +void dotc(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result); -void dotc(sycl::queue 
&queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result); +void dotc(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result); -void dotu(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result); +void dotu(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result); -void dotu(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result); +void dotu(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result); -void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result); -void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result); -void iamin(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void iamin(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void iamin(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void iamin(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result); -void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, 
int64_t incx, + sycl::buffer& result); -void iamax(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void iamax(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void iamax(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void iamax(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result); -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result); -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result); -void rot(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, float c, float s); +void rot(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, float c, float s); -void rot(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, double c, double s); +void rot(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, double c, double s); -void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, float c, float s); +void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, 
int64_t incy, float c, float s); -void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, double c, double s); +void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, double c, double s); -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer ¶m); +void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param); -void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer ¶m); +void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param); -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, float y1, sycl::buffer ¶m); +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param); -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, double y1, 
sycl::buffer ¶m); +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, double y1, sycl::buffer& param); -void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx); +void scal(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx); -void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx); +void scal(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx); -void scal(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx); +void scal(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx); -void scal(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx); +void scal(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx); -void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer, 1> &x, +void scal(sycl::queue& queue, int64_t n, float alpha, sycl::buffer, 1>& x, int64_t incx); -void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer, 1> &x, +void scal(sycl::queue& queue, int64_t n, double alpha, sycl::buffer, 1>& x, int64_t incx); -void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result); +void sdsdot(sycl::queue& queue, int64_t n, float sb, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result); -void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy); +void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy); -void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy); +void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy); -void swap(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, 
int64_t incx, - sycl::buffer, 1> &y, int64_t incy); +void swap(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy); -void swap(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy); +void swap(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, float beta, sycl::buffer &y, int64_t incy); +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, float beta, sycl::buffer& y, int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, double beta, sycl::buffer &y, int64_t incy); +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, double beta, sycl::buffer& y, int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); 
+void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy); +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy); -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy); +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy); -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); -void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer 
&a, int64_t lda, int64_t stridea, sycl::buffer &x, - int64_t incx, int64_t stridex, float beta, sycl::buffer &y, int64_t incy, +void gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stridea, sycl::buffer& x, + int64_t incx, int64_t stridex, float beta, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &x, int64_t incx, int64_t stridex, double beta, - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size); +void gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& x, int64_t incx, int64_t stridex, double beta, + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stridea, sycl::buffer, 1> &x, int64_t incx, - int64_t stridex, std::complex beta, sycl::buffer, 1> &y, +void gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stridea, sycl::buffer, 1>& x, int64_t incx, + int64_t stridex, std::complex beta, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stridea, sycl::buffer, 1> &x, int64_t incx, +void gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stridea, sycl::buffer, 1>& x, int64_t incx, int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, 
int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stridea, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &c, int64_t ldc, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stridea, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &x, int64_t incx, int64_t stridex, - sycl::buffer &c, int64_t ldc, int64_t stridec, int64_t batch_size); +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& x, int64_t incx, int64_t stridex, + sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stridea, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &c, int64_t ldc, int64_t stridec, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stridea, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& c, int64_t ldc, int64_t stridec, int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stridea, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &c, int64_t ldc, int64_t stridec, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stridea, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& c, int64_t ldc, int64_t stridec, int64_t batch_size); -void ger(sycl::queue &queue, int64_t m, int64_t n, 
float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void ger(sycl::queue& queue, int64_t m, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda); -void ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void ger(sycl::queue& queue, int64_t m, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda); -void gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda); - -void gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda); - -void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda); - -void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda); - -void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); - -void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); - -void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); - -void hemv(sycl::queue &queue, uplo 
upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy); - -void her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a, int64_t lda); - -void her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a, int64_t lda); - -void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda); - -void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda); - -void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, - int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda); + +void gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda); + +void geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda); + +void geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda); + +void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t 
incy); + +void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); + +void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); + +void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy); + +void her(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda); + +void her(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda); + +void her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda); + +void her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda); + +void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + int64_t incx, std::complex beta, sycl::buffer, 1>& y, int64_t incy); -void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, - int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + int64_t incx, std::complex beta, sycl::buffer, 1>& y, int64_t incy); -void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - sycl::buffer, 1> 
&x, int64_t incx, - sycl::buffer, 1> &a); +void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a); -void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a); +void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a); -void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a); +void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a); -void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a); +void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a); -void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy); +void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy); -void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy); +void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy); -void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float 
alpha, sycl::buffer &a, - sycl::buffer &x, int64_t incx, float beta, sycl::buffer &y, +void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& a, + sycl::buffer& x, int64_t incx, float beta, sycl::buffer& y, int64_t incy); -void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &a, - sycl::buffer &x, int64_t incx, double beta, sycl::buffer &y, +void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& a, + sycl::buffer& x, int64_t incx, double beta, sycl::buffer& y, int64_t incy); -void spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &a); +void spr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a); -void spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &a); +void spr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a); -void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a); +void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a); -void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a); +void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a); -void symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx, float beta, - sycl::buffer &y, int64_t incy); +void symv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& x, 
int64_t incx, float beta, + sycl::buffer& y, int64_t incy); -void symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx, double beta, - sycl::buffer &y, int64_t incy); +void symv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx, double beta, + sycl::buffer& y, int64_t incy); -void syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &a, int64_t lda); +void syr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a, int64_t lda); -void syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &a, int64_t lda); +void syr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a, int64_t lda); -void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda); -void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose 
trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); -void tbsv(sycl::queue 
&queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, sycl::buffer &x, int64_t incx); +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, sycl::buffer& x, int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, sycl::buffer &x, int64_t incx); +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, sycl::buffer& x, int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, sycl::buffer &x, int64_t incx); +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, sycl::buffer& x, int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, sycl::buffer &x, int64_t incx); +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag 
unit_diag, int64_t n, + sycl::buffer& a, sycl::buffer& x, int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx); - -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx); +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); + +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); - -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); - -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx); - -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx); - -void trsv(sycl::queue &queue, uplo upper_lower, transpose 
trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); - -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, double beta, sycl::buffer &c, int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - sycl::half alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, sycl::half beta, - sycl::buffer &c, int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc); - -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc); - -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - 
std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); - -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); - -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, - sycl::buffer, 1> &a, int64_t lda, float beta, - sycl::buffer, 1> &c, int64_t ldc); - -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, - sycl::buffer, 1> &a, int64_t lda, double beta, - sycl::buffer, 1> &c, int64_t ldc); - -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, float beta, - sycl::buffer, 1> &c, int64_t ldc); - -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, double beta, - sycl::buffer, 1> &c, int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - float beta, sycl::buffer &c, int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - double beta, sycl::buffer &c, int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, 
sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); - -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, float beta, sycl::buffer &c, +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); + +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx); + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); + +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx); + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc); + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, double beta, sycl::buffer& c, int64_t ldc); + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); + +void 
gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + sycl::half alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, sycl::half beta, + sycl::buffer& c, int64_t ldc); + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc); + +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc); + +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); + +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + sycl::buffer, 1>& a, int64_t lda, float beta, + sycl::buffer, 1>& c, int64_t ldc); + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, + sycl::buffer, 1>& a, int64_t lda, double beta, + sycl::buffer, 1>& c, int64_t ldc); + +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, float beta, + sycl::buffer, 1>& c, int64_t 
ldc); + +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, double beta, + sycl::buffer, 1>& c, int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + float beta, sycl::buffer& c, int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + double beta, sycl::buffer& c, int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, float beta, sycl::buffer& c, int64_t ldc); -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, - sycl::buffer &a, int64_t lda, double beta, sycl::buffer &c, +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, + sycl::buffer& a, int64_t lda, double beta, sycl::buffer& c, int64_t ldc); -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc); +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t 
lda, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc); -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc); +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, sycl::buffer, 1> &c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, 
sycl::buffer, 1> &a, int64_t lda, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - float beta, sycl::buffer &c, int64_t ldc); +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + float beta, sycl::buffer& c, int64_t ldc); -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, double beta, sycl::buffer &c, int64_t ldc); +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, double beta, sycl::buffer& c, int64_t ldc); -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc); +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + 
std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc); -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - int64_t m, int64_t n, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb); +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + int64_t m, int64_t n, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb); -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - int64_t m, int64_t n, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb); +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + int64_t m, int64_t n, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb); -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb); +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb); -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb); + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - int64_t m, int64_t n, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb); +void trsm(sycl::queue& queue, side 
left_right, uplo upper_lower, transpose trans, diag unit_diag, + int64_t m, int64_t n, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - int64_t m, int64_t n, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb); +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + int64_t m, int64_t n, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb); +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb); - -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); - -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, double beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); - -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, 
int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, - int64_t stride_b, std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb); + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, double beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); + +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, + int64_t stride_b, std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, sycl::half alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, 
int64_t stride_b, - sycl::half beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + 
int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + 
sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc); -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc); -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc); - -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc); - -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co); - -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - 
sycl::buffer &c, int64_t ldc, sycl::buffer &co); - -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co); - -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co); - -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size); - -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size); - -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc); + +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc); + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + 
sycl::buffer& c, int64_t ldc, sycl::buffer& co); + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co); + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co); + +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co); + +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size); + +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size); + +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size); -void 
imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + float beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, 
int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, 
- sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& 
a, int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb); -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb); -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb); -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb); -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, float beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, float beta, + sycl::buffer& b, 
int64_t ldb, sycl::buffer& c, int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, double beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, double beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc); // USM APIs -sycl::event asum(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - float *result, const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + float* result, const std::vector& dependencies = {}); -sycl::event asum(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - double *result, const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + double* result, const std::vector& 
dependencies = {}); -sycl::event asum(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result, - const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* result, + const std::vector& dependencies = {}); -sycl::event asum(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result, - const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* result, + const std::vector& dependencies = {}); -sycl::event axpy(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, float *y, - int64_t incy, const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, float* y, + int64_t incy, const std::vector& dependencies = {}); -sycl::event axpy(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - double *y, int64_t incy, const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double* y, int64_t incy, const std::vector& dependencies = {}); -sycl::event axpy(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpy(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float 
**x, int64_t *incx, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, float* alpha, const float** x, int64_t* incx, + float** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, double* alpha, const double** x, + int64_t* incx, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t n, float 
alpha, const float* x, int64_t incx, + int64_t stridex, float* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - const float beta, float *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + const float beta, float* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const 
double *x, int64_t incx, - const double beta, double *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + const double beta, double* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *y, - int64_t incy, const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* y, + int64_t incy, const std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *y, - int64_t incy, const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* y, + int64_t incy, const std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const 
std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const float** x, int64_t* incx, float** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const double** x, int64_t* incx, double** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, + int64_t* incx, std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = 
{}); -sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t stridex, - float *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t n, const float* x, int64_t incx, int64_t stridex, + float* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx, - int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t n, const double* x, int64_t incx, + int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t stridex, std::complex *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t stridex, std::complex* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t stridex, std::complex *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t stridex, std::complex* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, float *result, const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, float* 
result, const std::vector& dependencies = {}); -sycl::event dot(sycl::queue &queue, int64_t n, const double *x, int64_t incx, const double *y, - int64_t incy, double *result, const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, int64_t n, const double* x, int64_t incx, const double* y, + int64_t incy, double* result, const std::vector& dependencies = {}); -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, double *result, const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, double* result, const std::vector& dependencies = {}); -sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const std::vector &dependencies = {}); +sycl::event dotc(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies = {}); -sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const std::vector &dependencies = {}); +sycl::event dotc(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies = {}); -sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const std::vector &dependencies = {}); +sycl::event dotu(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies = {}); -sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const 
std::vector &dependencies = {}); +sycl::event dotu(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, int64_t n, const float* x, int64_t incx, int64_t* result, + const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, int64_t n, const double* x, int64_t incx, int64_t* result, + const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t *result, const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t *result, const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, int64_t n, const float* x, int64_t incx, int64_t* result, + const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, int64_t n, const double* x, int64_t incx, int64_t* result, + const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - 
int64_t *result, const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t *result, const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - float *result, const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + float* result, const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - double *result, const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + double* result, const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result, - const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* result, + const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result, - const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* result, + const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, - std::complex *y, int64_t incy, float c, float s, - const std::vector &dependencies = {}); +sycl::event rot(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, float c, float s, + const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, int64_t n, 
std::complex *x, int64_t incx, - std::complex *y, int64_t incy, double c, double s, - const std::vector &dependencies = {}); +sycl::event rot(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, double c, double s, + const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy, - float c, float s, const std::vector &dependencies = {}); +sycl::event rot(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + float c, float s, const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy, - double c, double s, const std::vector &dependencies = {}); +sycl::event rot(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + double c, double s, const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, - const std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s, - const std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, float *c, - std::complex *s, const std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, float* c, + std::complex* s, const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, double *c, - std::complex *s, const std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, double* c, + std::complex* s, const std::vector& dependencies = {}); -sycl::event 
rotm(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy, - float *param, const std::vector &dependencies = {}); +sycl::event rotm(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + float* param, const std::vector& dependencies = {}); -sycl::event rotm(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy, - double *param, const std::vector &dependencies = {}); +sycl::event rotm(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + double* param, const std::vector& dependencies = {}); -sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, - const std::vector &dependencies = {}); +sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, float y1, float* param, + const std::vector& dependencies = {}); -sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, - const std::vector &dependencies = {}); +sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, double y1, double* param, + const std::vector& dependencies = {}); -sycl::event scal(sycl::queue &queue, int64_t n, float alpha, float *x, int64_t incx, - const std::vector &dependencies = {}); +sycl::event scal(sycl::queue& queue, int64_t n, float alpha, float* x, int64_t incx, + const std::vector& dependencies = {}); -sycl::event scal(sycl::queue &queue, int64_t n, double alpha, double *x, int64_t incx, - const std::vector &dependencies = {}); +sycl::event scal(sycl::queue& queue, int64_t n, double alpha, double* x, int64_t incx, + const std::vector& dependencies = {}); -sycl::event scal(sycl::queue &queue, int64_t n, std::complex alpha, std::complex *x, - int64_t incx, const std::vector &dependencies = {}); +sycl::event scal(sycl::queue& queue, int64_t n, std::complex alpha, std::complex* x, + int64_t incx, const std::vector& dependencies = {}); -sycl::event scal(sycl::queue &queue, int64_t 
n, std::complex alpha, std::complex *x, - int64_t incx, const std::vector &dependencies = {}); +sycl::event scal(sycl::queue& queue, int64_t n, std::complex alpha, std::complex* x, + int64_t incx, const std::vector& dependencies = {}); -sycl::event scal(sycl::queue &queue, int64_t n, float alpha, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); +sycl::event scal(sycl::queue& queue, int64_t n, float alpha, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); -sycl::event scal(sycl::queue &queue, int64_t n, double alpha, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); +sycl::event scal(sycl::queue& queue, int64_t n, double alpha, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); -sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx, - const float *y, int64_t incy, float *result, - const std::vector &dependencies = {}); +sycl::event sdsdot(sycl::queue& queue, int64_t n, float sb, const float* x, int64_t incx, + const float* y, int64_t incy, float* result, + const std::vector& dependencies = {}); -sycl::event swap(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event swap(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event swap(sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); 
-sycl::event swap(sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta, - float *y, int64_t incy, const std::vector &dependencies = {}); +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + float alpha, const float* a, int64_t lda, const float* x, int64_t incx, float beta, + float* y, int64_t incy, const std::vector& dependencies = {}); -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - double alpha, const double *a, int64_t lda, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + double alpha, const double* a, int64_t lda, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t 
kl, int64_t ku, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y, - int64_t incy, const std::vector &dependencies = {}); +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, const float* x, int64_t incx, float beta, float* y, + int64_t incy, const std::vector& dependencies = {}); -sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, const double *x, int64_t incx, double beta, - double *y, int64_t incy, const std::vector &dependencies = {}); +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, const double* x, int64_t incx, double beta, + double* y, int64_t incy, const std::vector& dependencies = {}); -sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event gemv(sycl::queue &queue, 
transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); -sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stridea, const float *x, int64_t incx, - int64_t stridex, float beta, float *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stridea, const float* x, int64_t incx, + int64_t stridex, float beta, float* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stridea, const double *x, int64_t incx, - int64_t stridex, double beta, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stridea, const double* x, int64_t incx, + int64_t stridex, double beta, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stridea, const std::complex *x, int64_t incx, int64_t stridex, - std::complex beta, std::complex *y, int64_t incy, 
+sycl::event gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stridea, const std::complex* x, int64_t incx, int64_t stridex, + std::complex beta, std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stridea, const std::complex *x, int64_t incx, - int64_t stridex, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stridea, const std::complex* x, int64_t incx, + int64_t stridex, std::complex beta, std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, const double **x, int64_t *incx, - double *beta, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex 
*alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a, - int64_t lda, int64_t stridea, const float *x, int64_t incx, int64_t stridex, - float *c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a, - int64_t lda, int64_t stridea, const double *x, int64_t incx, int64_t stridex, - double *c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stridea, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stridea, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const float **a, int64_t *lda, const float **x, int64_t *incx, float **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const double **a, int64_t *lda, const double **x, int64_t *incx, double **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, 
int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, const float *x, int64_t incx, - const float *y, int64_t incy, float *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, const double *x, - int64_t incx, const double *y, int64_t incy, double *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, 
int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *x, - int64_t incx, std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *x, - int64_t incx, std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - const std::complex *x, int64_t incx, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - const std::complex *x, int64_t incx, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, int64_t incx, - std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies = {}); - 
-sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, int64_t incx, - std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - const std::complex *x, int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - const std::complex *x, int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha, - const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y, - int64_t incy, const std::vector &dependencies = {}); - -sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha, - const double *a, int64_t lda, const double *x, int64_t incx, double beta, - double *y, int64_t incy, const std::vector &dependencies = {}); - -sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *a, - const float *x, int64_t incx, float beta, float *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *a, - const double *x, int64_t incx, double beta, double *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t 
n, float alpha, const float *x, - int64_t incx, float *a, const std::vector &dependencies = {}); - -sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, double *a, const std::vector &dependencies = {}); - -sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x, - int64_t incx, const float *y, int64_t incy, float *a, - const std::vector &dependencies = {}); - -sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, const double *y, int64_t incy, double *a, - const std::vector &dependencies = {}); - -sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *a, - int64_t lda, const float *x, int64_t incx, float beta, float *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *a, - int64_t lda, const double *x, int64_t incx, double beta, double *y, int64_t incy, - const std::vector &dependencies = {}); - -sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x, - int64_t incx, float *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, double *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x, - int64_t incx, const float *y, int64_t incy, float *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, const double *y, int64_t incy, double *a, int64_t lda, - const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const 
float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t lda, std::complex *x, - int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t lda, std::complex *x, - int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t lda, std::complex *x, - int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t lda, std::complex *x, - int64_t incx, const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const float *a, float *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const double *a, double *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event 
tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const float *a, float *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const double *a, double *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event 
trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *x, int64_t incx, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, - float beta, float *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose 
transb, int64_t m, int64_t n, - int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda, const sycl::half *b, - int64_t ldb, sycl::half beta, sycl::half *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const sycl::half *a, int64_t lda, const sycl::half *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const std::complex *a, int64_t lda, float beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const std::complex *a, int64_t lda, double beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, float beta, 
std::complex *c, - int64_t ldc, const std::vector &dependencies = {}); - -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, double beta, std::complex *c, - int64_t ldc, const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta, - float *c, int64_t ldc, const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, const double *b, int64_t ldb, - double beta, double *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, float beta, float *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, double beta, double *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, 
int64_t lda, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, int64_t stride_a, float beta, - float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, int64_t stride_a, double beta, - double *c, 
int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, float* alpha, + const float** a, int64_t* lda, const float** x, int64_t* incx, float* beta, + float** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, double* alpha, + const double** a, int64_t* lda, const double** x, int64_t* incx, + double* beta, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const float* a, + int64_t lda, int64_t stridea, const float* x, int64_t incx, int64_t stridex, + float* c, int64_t ldc, int64_t stridec, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const double* a, + int64_t lda, 
int64_t stridea, const double* x, int64_t incx, int64_t stridex, + double* c, int64_t ldc, int64_t stridec, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stridea, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* c, int64_t ldc, int64_t stridec, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stridea, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* c, int64_t ldc, int64_t stridec, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const float** a, int64_t* lda, const float** x, int64_t* incx, float** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const double** a, int64_t* lda, const double** x, int64_t* incx, double** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event ger(sycl::queue& queue, int64_t m, int64_t n, float alpha, const float* x, int64_t incx, + const float* y, 
int64_t incy, float* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event ger(sycl::queue& queue, int64_t m, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, const std::complex* x, + int64_t incx, std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies = 
{}); + +sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, const std::complex* x, + int64_t incx, std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + const std::complex* x, int64_t incx, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + const std::complex* x, int64_t incx, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, int64_t incx, + std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, int64_t incx, + std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + const std::complex* x, int64_t incx, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + const std::complex* x, int64_t incx, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event hpr2(sycl::queue& 
queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, float alpha, + const float* a, int64_t lda, const float* x, int64_t incx, float beta, float* y, + int64_t incy, const std::vector& dependencies = {}); + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, double alpha, + const double* a, int64_t lda, const double* x, int64_t incx, double beta, + double* y, int64_t incy, const std::vector& dependencies = {}); + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* a, + const float* x, int64_t incx, float beta, float* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* a, + const double* x, int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, float* a, const std::vector& dependencies = {}); + +sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, double* a, const std::vector& dependencies = {}); + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, const float* y, int64_t incy, float* a, + const std::vector& dependencies = {}); + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* a, + const 
std::vector& dependencies = {}); + +sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* a, + int64_t lda, const float* x, int64_t incx, float beta, float* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* a, + int64_t lda, const double* x, int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies = {}); + +sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, float* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, double* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, const float* y, int64_t incy, float* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* a, int64_t lda, + const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, 
+ int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* a, float* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* a, double* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* a, float* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, 
transpose trans, diag unit_diag, int64_t n, + const double* a, double* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, 
uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, + float beta, float* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, const sycl::half* a, int64_t lda, const sycl::half* b, + int64_t ldb, sycl::half beta, sycl::half* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const sycl::half* a, int64_t lda, const sycl::half* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const bfloat16* a, int64_t 
lda, const bfloat16* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const std::complex* a, int64_t lda, float beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const std::complex* a, int64_t lda, double beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, float beta, std::complex* c, + int64_t ldc, const std::vector& dependencies = {}); + +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, double beta, std::complex* c, + int64_t ldc, const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, float beta, + float* c, int64_t ldc, const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side 
left_right, uplo upper_lower, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, const double* b, int64_t ldb, + double beta, double* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, float beta, float* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, double beta, double* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, float* alpha, const float** a, int64_t* lda, float* beta, + float** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + 
+sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, double* alpha, const double** a, int64_t* lda, double* beta, + double** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, int64_t stride_a, float beta, + float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, int64_t stride_a, double beta, + double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, 
std::complex beta, std::complex *c, +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta, - float *c, int64_t ldc, const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, const double *b, int64_t ldb, - double beta, double *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda, - float *b, int64_t ldb, const std::vector &dependencies = {}); - -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda, - double *b, int64_t ldb, const std::vector &dependencies = {}); - -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, + const 
std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, float beta, + float* c, int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, const double* b, int64_t ldb, + double beta, double* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, int64_t lda, + float* b, int64_t ldb, const std::vector& dependencies = {}); + +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, int64_t lda, + double* b, int64_t ldb, const std::vector& dependencies = {}); + +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trmm(sycl::queue &queue, side 
left_right, uplo upper_lower, transpose trans, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda, - float *b, int64_t ldb, const std::vector &dependencies = {}); +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, int64_t lda, + float* b, int64_t ldb, const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda, - double *b, int64_t ldb, const std::vector &dependencies = {}); +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, int64_t lda, + double* b, int64_t ldb, const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, 
diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, - int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t 
stride_b, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, float *alpha, const float **a, - int64_t *lda, float **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, double *alpha, const double **a, - int64_t *lda, double **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, std::complex *alpha, - const std::complex **a, int64_t *lda, std::complex **b, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, std::complex *alpha, - const std::complex **a, int64_t *lda, std::complex **b, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda, - const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, 
int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda, - const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, const std::complex **b, - int64_t *ldb, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, const std::complex **b, - int64_t *ldb, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, sycl::half *alpha, const sycl::half **a, - int64_t *lda, const sycl::half **b, int64_t *ldb, sycl::half *beta, - sycl::half **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const sycl::half **a, int64_t *lda, - const sycl::half **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda, - const std::int8_t **b, int64_t 
*ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda, - const std::int8_t **b, int64_t *ldb, float *beta, std::int32_t **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const float *a, int64_t lda, int64_t stride_a, - const float *b, int64_t ldb, int64_t stride_b, float beta, float *c, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, float* alpha, const float** a, + int64_t* lda, float** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, double* alpha, const double** a, + int64_t* lda, double** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, 
int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const float** a, int64_t* lda, + const float** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, double* alpha, const double** a, int64_t* lda, + const double** b, int64_t* ldb, double* beta, double** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, sycl::half* alpha, const sycl::half** a, + int64_t* lda, const sycl::half** b, int64_t* ldb, sycl::half* beta, + sycl::half** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const 
sycl::half** a, int64_t* lda, + const sycl::half** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, std::int32_t** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const float* a, int64_t lda, int64_t stride_a, + const float* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, const double *a, int64_t lda, int64_t stride_a, - const double *b, int64_t ldb, int64_t stride_b, double beta, double *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, const double* a, int64_t lda, int64_t stride_a, + const double* b, int64_t ldb, int64_t stride_b, double beta, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, 
std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, const std::complex *b, int64_t ldb, - int64_t stride_b, std::complex beta, std::complex *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, const std::complex *b, int64_t ldb, - int64_t stride_b, std::complex beta, std::complex *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda, - int64_t stride_a, const sycl::half *b, int64_t ldb, int64_t stride_b, - sycl::half beta, sycl::half *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, const sycl::half* a, int64_t lda, + int64_t stride_a, const sycl::half* b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::half* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& 
dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const sycl::half *a, int64_t lda, int64_t stride_a, - const sycl::half *b, int64_t ldb, int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const sycl::half* a, int64_t lda, int64_t stride_a, + const sycl::half* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a, - const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a, - const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, - std::int32_t *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - 
int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, - int64_t lda, std::int8_t ao, const std::uint8_t *b, int64_t ldb, - std::uint8_t bo, float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t *a, - int64_t lda, std::int8_t ao, const std::int8_t *b, int64_t ldb, - std::int8_t bo, float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const std::uint8_t *a, - int64_t lda, std::uint8_t ao, const std::int8_t *b, int64_t ldb, - std::int8_t bo, float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); - -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, 
float alpha, const std::uint8_t *a, - int64_t lda, std::uint8_t ao, const std::uint8_t *b, int64_t ldb, - std::uint8_t bo, float beta, std::int32_t *c, int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, + std::int32_t* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& 
queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t* a, + int64_t lda, std::int8_t ao, const std::uint8_t* b, int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const std::int8_t* a, + int64_t lda, std::int8_t ao, const std::int8_t* b, int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const std::uint8_t* a, + int64_t lda, std::uint8_t ao, const std::int8_t* b, int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, const std::vector& dependencies = {}); + +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const std::uint8_t* a, + int64_t lda, std::uint8_t ao, const std::uint8_t* b, int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, int64_t ldc, + const std::int32_t* co, const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t 
lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies = {}); +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, 
double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stride_a, + float beta, const float* b, int64_t ldb, int64_t stride_b, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stride_a, + double beta, const 
double* b, int64_t ldb, int64_t stride_b, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies = {}); + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, float *b, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, double *b, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); - 
-sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, 
transpose transb, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, float beta, const float *b, - int64_t ldb, float *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, double beta, const double *b, - int64_t ldb, double *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, float* b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, double* b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies 
= {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, std::int64_t stridea, float* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, std::int64_t stridea, double* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, float beta, const float* b, + int64_t ldb, float* c, int64_t ldc, + const std::vector& dependencies = {}); + 
+sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, double beta, const double* b, + int64_t ldb, double* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies = {}); sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb, diff --git a/include/oneapi/mkl/detail/exceptions.hpp b/include/oneapi/mkl/detail/exceptions.hpp index 7767c2ac3..18eeca5b1 100644 --- a/include/oneapi/mkl/detail/exceptions.hpp +++ b/include/oneapi/mkl/detail/exceptions.hpp @@ -31,14 +31,14 @@ namespace mkl { class backend_not_found : public oneapi::mkl::exception { public: - backend_not_found(const std::string &info = "") + backend_not_found(const std::string& info = "") : oneapi::mkl::exception( "", "", ((info.length() != 0) ? info : "Couldn't load selected backend")) {} }; class function_not_found : public oneapi::mkl::exception { public: - function_not_found(const std::string &info = "") + function_not_found(const std::string& info = "") : oneapi::mkl::exception( "", "", ((info.length() != 0) ? 
info : "Couldn't load functions from selected backend")) { @@ -47,7 +47,7 @@ class function_not_found : public oneapi::mkl::exception { class specification_mismatch : public oneapi::mkl::exception { public: - specification_mismatch(const std::string &info = "") + specification_mismatch(const std::string& info = "") : oneapi::mkl::exception( "", "", ((info.length() != 0) ? info : "Loaded oneMKL specification version mismatch")) {} diff --git a/include/oneapi/mkl/detail/get_device_id.hpp b/include/oneapi/mkl/detail/get_device_id.hpp index 88b235754..dc3c28402 100644 --- a/include/oneapi/mkl/detail/get_device_id.hpp +++ b/include/oneapi/mkl/detail/get_device_id.hpp @@ -40,7 +40,7 @@ namespace oneapi { namespace mkl { -inline oneapi::mkl::device get_device_id(sycl::queue &queue) { +inline oneapi::mkl::device get_device_id(sycl::queue& queue) { oneapi::mkl::device device_id; if (queue.get_device().is_cpu()) device_id = device::x86cpu; diff --git a/include/oneapi/mkl/dft/backward.hpp b/include/oneapi/mkl/dft/backward.hpp index 3cd03e13b..becca85d0 100644 --- a/include/oneapi/mkl/dft/backward.hpp +++ b/include/oneapi/mkl/dft/backward.hpp @@ -33,7 +33,7 @@ namespace oneapi::mkl::dft { //In-place transform template -void compute_backward(descriptor_type &desc, sycl::buffer &inout) { +void compute_backward(descriptor_type& desc, sycl::buffer& inout) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); @@ -46,8 +46,8 @@ void compute_backward(descriptor_type &desc, sycl::buffer &inout) //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template , bool> = true> -void compute_backward(descriptor_type &desc, sycl::buffer &inout_re, - sycl::buffer &inout_im) { +void compute_backward(descriptor_type& desc, sycl::buffer& inout_re, + sycl::buffer& inout_im) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); @@ -61,8 +61,8 @@ void compute_backward(descriptor_type &desc, 
sycl::buffer &inout_r //Out-of-place transform template -void compute_backward(descriptor_type &desc, sycl::buffer &in, - sycl::buffer &out) { +void compute_backward(descriptor_type& desc, sycl::buffer& in, + sycl::buffer& out) { static_assert(detail::valid_compute_arg::value, "unexpected type for input_type"); static_assert(detail::valid_compute_arg::value, @@ -79,9 +79,9 @@ void compute_backward(descriptor_type &desc, sycl::buffer &in, //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -void compute_backward(descriptor_type &desc, sycl::buffer &in_re, - sycl::buffer &in_im, sycl::buffer &out_re, - sycl::buffer &out_im) { +void compute_backward(descriptor_type& desc, sycl::buffer& in_re, + sycl::buffer& in_im, sycl::buffer& out_re, + sycl::buffer& out_im) { static_assert(detail::valid_compute_arg::value, "unexpected type for input_type"); static_assert(detail::valid_compute_arg::value, @@ -104,34 +104,32 @@ void compute_backward(descriptor_type &desc, sycl::buffer &in_re, //In-place transform template -sycl::event compute_backward(descriptor_type &desc, data_type *inout, - const std::vector &dependencies = {}) { +sycl::event compute_backward(descriptor_type& desc, data_type* inout, + const std::vector& dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); using fwd_type = typename detail::descriptor_info::forward_type; - return get_commit(desc)->backward_ip_cc(desc, reinterpret_cast(inout), - dependencies); + return get_commit(desc)->backward_ip_cc(desc, reinterpret_cast(inout), dependencies); } //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template , bool> = true> -sycl::event compute_backward(descriptor_type &desc, data_type *inout_re, data_type *inout_im, - const std::vector &dependencies = {}) { +sycl::event compute_backward(descriptor_type& desc, data_type* inout_re, data_type* inout_im, + const std::vector& 
dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); using scalar_type = typename detail::descriptor_info::scalar_type; - return get_commit(desc)->backward_ip_rr(desc, reinterpret_cast(inout_re), - reinterpret_cast(inout_im), - dependencies); + return get_commit(desc)->backward_ip_rr(desc, reinterpret_cast(inout_re), + reinterpret_cast(inout_im), dependencies); } //Out-of-place transform template -sycl::event compute_backward(descriptor_type &desc, input_type *in, output_type *out, - const std::vector &dependencies = {}) { +sycl::event compute_backward(descriptor_type& desc, input_type* in, output_type* out, + const std::vector& dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for input_type"); static_assert(detail::valid_compute_arg::value, @@ -139,25 +137,25 @@ sycl::event compute_backward(descriptor_type &desc, input_type *in, output_type using fwd_type = typename detail::descriptor_info::forward_type; using bwd_type = typename detail::descriptor_info::backward_type; - return get_commit(desc)->backward_op_cc(desc, reinterpret_cast(in), - reinterpret_cast(out), dependencies); + return get_commit(desc)->backward_op_cc(desc, reinterpret_cast(in), + reinterpret_cast(out), dependencies); } //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -sycl::event compute_backward(descriptor_type &desc, input_type *in_re, input_type *in_im, - output_type *out_re, output_type *out_im, - const std::vector &dependencies = {}) { +sycl::event compute_backward(descriptor_type& desc, input_type* in_re, input_type* in_im, + output_type* out_re, output_type* out_im, + const std::vector& dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for input_type"); static_assert(detail::valid_compute_arg::value, "unexpected type for output_type"); using scalar_type = typename detail::descriptor_info::scalar_type; - return 
get_commit(desc)->backward_op_rr(desc, reinterpret_cast(in_re), - reinterpret_cast(in_im), - reinterpret_cast(out_re), - reinterpret_cast(out_im), dependencies); + return get_commit(desc)->backward_op_rr(desc, reinterpret_cast(in_re), + reinterpret_cast(in_im), + reinterpret_cast(out_re), + reinterpret_cast(out_im), dependencies); } } // namespace oneapi::mkl::dft diff --git a/include/oneapi/mkl/dft/detail/commit_impl.hpp b/include/oneapi/mkl/dft/detail/commit_impl.hpp index 9e827f357..0c1a1e0b2 100644 --- a/include/oneapi/mkl/dft/detail/commit_impl.hpp +++ b/include/oneapi/mkl/dft/detail/commit_impl.hpp @@ -54,18 +54,18 @@ class commit_impl { public: commit_impl(sycl::queue queue, mkl::backend backend, - const dft::detail::dft_values &config_values) + const dft::detail::dft_values& config_values) : queue_(queue), backend_(backend), external_workspace_helper_(config_values.workspace_placement == dft::detail::config_value::WORKSPACE_EXTERNAL) {} // rule of three - commit_impl(const commit_impl &other) = delete; - commit_impl &operator=(const commit_impl &other) = delete; + commit_impl(const commit_impl& other) = delete; + commit_impl& operator=(const commit_impl& other) = delete; virtual ~commit_impl() = default; - sycl::queue &get_queue() noexcept { + sycl::queue& get_queue() noexcept { return queue_; } @@ -73,9 +73,9 @@ class commit_impl { return backend_; } - virtual void *get_handle() noexcept = 0; + virtual void* get_handle() noexcept = 0; - virtual void commit(const dft_values &) = 0; + virtual void commit(const dft_values&) = 0; inline std::int64_t get_workspace_external_bytes() { return external_workspace_helper_.get_rqd_workspace_bytes(*this); @@ -87,54 +87,54 @@ class commit_impl { // When not overridden, external workspace support is faked: an external workspace can be set, // and errors will be generated according to the specificiation, // but the required workspace size will always be zero, and any given workspace will not actually be used. 
- virtual void set_workspace(scalar_type *usm_workspace) { + virtual void set_workspace(scalar_type* usm_workspace) { external_workspace_helper_.set_workspace_throw(*this, usm_workspace); } - virtual void set_workspace(sycl::buffer &buffer_workspace) { + virtual void set_workspace(sycl::buffer& buffer_workspace) { external_workspace_helper_.set_workspace_throw(*this, buffer_workspace); } - virtual void forward_ip_cc(descriptor_type &desc, sycl::buffer &inout) = 0; - virtual void forward_ip_rr(descriptor_type &desc, sycl::buffer &inout_re, - sycl::buffer &inout_im) = 0; - virtual void forward_op_cc(descriptor_type &desc, sycl::buffer &in, - sycl::buffer &out) = 0; - virtual void forward_op_rr(descriptor_type &desc, sycl::buffer &in_re, - sycl::buffer &in_im, - sycl::buffer &out_re, - sycl::buffer &out_im) = 0; - - virtual sycl::event forward_ip_cc(descriptor_type &desc, fwd_type *inout, - const std::vector &dependencies) = 0; - virtual sycl::event forward_ip_rr(descriptor_type &desc, scalar_type *inout_re, - scalar_type *inout_im, - const std::vector &dependencies) = 0; - virtual sycl::event forward_op_cc(descriptor_type &desc, fwd_type *in, bwd_type *out, - const std::vector &dependencies) = 0; - virtual sycl::event forward_op_rr(descriptor_type &desc, scalar_type *in_re, scalar_type *in_im, - scalar_type *out_re, scalar_type *out_im, - const std::vector &dependencies) = 0; - - virtual void backward_ip_cc(descriptor_type &desc, sycl::buffer &inout) = 0; - virtual void backward_ip_rr(descriptor_type &desc, sycl::buffer &inout_re, - sycl::buffer &inout_im) = 0; - virtual void backward_op_cc(descriptor_type &desc, sycl::buffer &in, - sycl::buffer &out) = 0; - virtual void backward_op_rr(descriptor_type &desc, sycl::buffer &in_re, - sycl::buffer &in_im, - sycl::buffer &out_re, - sycl::buffer &out_im) = 0; - - virtual sycl::event backward_ip_cc(descriptor_type &desc, fwd_type *inout, - const std::vector &dependencies) = 0; - virtual sycl::event 
backward_ip_rr(descriptor_type &desc, scalar_type *inout_re, - scalar_type *inout_im, - const std::vector &dependencies) = 0; - virtual sycl::event backward_op_cc(descriptor_type &desc, bwd_type *in, fwd_type *out, - const std::vector &dependencies) = 0; - virtual sycl::event backward_op_rr(descriptor_type &desc, scalar_type *in_re, - scalar_type *in_im, scalar_type *out_re, scalar_type *out_im, - const std::vector &dependencies) = 0; + virtual void forward_ip_cc(descriptor_type& desc, sycl::buffer& inout) = 0; + virtual void forward_ip_rr(descriptor_type& desc, sycl::buffer& inout_re, + sycl::buffer& inout_im) = 0; + virtual void forward_op_cc(descriptor_type& desc, sycl::buffer& in, + sycl::buffer& out) = 0; + virtual void forward_op_rr(descriptor_type& desc, sycl::buffer& in_re, + sycl::buffer& in_im, + sycl::buffer& out_re, + sycl::buffer& out_im) = 0; + + virtual sycl::event forward_ip_cc(descriptor_type& desc, fwd_type* inout, + const std::vector& dependencies) = 0; + virtual sycl::event forward_ip_rr(descriptor_type& desc, scalar_type* inout_re, + scalar_type* inout_im, + const std::vector& dependencies) = 0; + virtual sycl::event forward_op_cc(descriptor_type& desc, fwd_type* in, bwd_type* out, + const std::vector& dependencies) = 0; + virtual sycl::event forward_op_rr(descriptor_type& desc, scalar_type* in_re, scalar_type* in_im, + scalar_type* out_re, scalar_type* out_im, + const std::vector& dependencies) = 0; + + virtual void backward_ip_cc(descriptor_type& desc, sycl::buffer& inout) = 0; + virtual void backward_ip_rr(descriptor_type& desc, sycl::buffer& inout_re, + sycl::buffer& inout_im) = 0; + virtual void backward_op_cc(descriptor_type& desc, sycl::buffer& in, + sycl::buffer& out) = 0; + virtual void backward_op_rr(descriptor_type& desc, sycl::buffer& in_re, + sycl::buffer& in_im, + sycl::buffer& out_re, + sycl::buffer& out_im) = 0; + + virtual sycl::event backward_ip_cc(descriptor_type& desc, fwd_type* inout, + const std::vector& dependencies) = 0; 
+ virtual sycl::event backward_ip_rr(descriptor_type& desc, scalar_type* inout_re, + scalar_type* inout_im, + const std::vector& dependencies) = 0; + virtual sycl::event backward_op_cc(descriptor_type& desc, bwd_type* in, fwd_type* out, + const std::vector& dependencies) = 0; + virtual sycl::event backward_op_rr(descriptor_type& desc, scalar_type* in_re, + scalar_type* in_im, scalar_type* out_re, scalar_type* out_im, + const std::vector& dependencies) = 0; /** For compute calls, throw errors for the external workspace as required. * @tparam ArgTs The non-descriptor arg(s) for the compute call. First one is used to check @@ -142,7 +142,7 @@ class commit_impl { * @param function_name The function name to user in generated exceptions. */ template - void compute_call_throw(const char *function_name) { + void compute_call_throw(const char* function_name) { external_workspace_helper_.template compute_call_throw(function_name); } @@ -151,14 +151,14 @@ class commit_impl { * @param function_name The function name to user in generated exceptions. * @param cgh The command group handler to associate the accessor with. */ - void add_buffer_workspace_dependency_if_rqd(const char *function_name, sycl::handler &cgh) { + void add_buffer_workspace_dependency_if_rqd(const char* function_name, sycl::handler& cgh) { external_workspace_helper_.add_buffer_dependency_if_rqd(function_name, cgh); } /** If WORKSPACE_EXTERNAL is set, depend on the last USM workspace event added via set_last_usm_workspace_event. * @param cgh The command group handler to associate the accessor with. */ - void depend_on_last_usm_workspace_event_if_rqd(sycl::handler &cgh) { + void depend_on_last_usm_workspace_event_if_rqd(sycl::handler& cgh) { external_workspace_helper_.depend_on_last_usm_workspace_event_if_rqd(cgh); } @@ -166,7 +166,7 @@ class commit_impl { * subsequent calls to depend_on_last_usm_workspace_event. * @param sycl_event The last usage of the USM workspace. 
*/ - void set_last_usm_workspace_event_if_rqd(sycl::event &sycl_event) { + void set_last_usm_workspace_event_if_rqd(sycl::event& sycl_event) { external_workspace_helper_.set_last_usm_workspace_event_if_rqd(sycl_event); } diff --git a/include/oneapi/mkl/dft/detail/dft_ct.hxx b/include/oneapi/mkl/dft/detail/dft_ct.hxx index 20cd537d8..7fc2921e4 100644 --- a/include/oneapi/mkl/dft/detail/dft_ct.hxx +++ b/include/oneapi/mkl/dft/detail/dft_ct.hxx @@ -20,8 +20,8 @@ // Commit template -ONEMKL_EXPORT dft::detail::commit_impl *create_commit( - const dft::detail::descriptor &desc, sycl::queue &sycl_queue); +ONEMKL_EXPORT dft::detail::commit_impl* create_commit( + const dft::detail::descriptor& desc, sycl::queue& sycl_queue); // BUFFER version @@ -34,105 +34,105 @@ using bwd = typename detail::descriptor_info::backward_type; //In-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout); +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout); //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout_re, - sycl::buffer, 1> &inout_im); +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout_re, + sycl::buffer, 1>& inout_im); //Out-of-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer, 1> &in, - sycl::buffer, 1> &out); +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, sycl::buffer, 1>& in, + sycl::buffer, 1>& out); //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &in_re, - sycl::buffer, 1> &in_im, - sycl::buffer, 1> &out_re, - sycl::buffer, 1> &out_im); +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& in_re, + sycl::buffer, 
1>& in_im, + sycl::buffer, 1>& out_re, + sycl::buffer, 1>& out_im); //USM version //In-place transform template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies); //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar *inout_re, - scalar *inout_im, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, scalar* inout_re, + scalar* inout_im, + const std::vector& dependencies); //Out-of-place transform template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *in, - bwd *out, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* in, + bwd* out, + const std::vector& dependencies); //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar *in_re, - scalar *in_im, - scalar *out_re, - scalar *out_im, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, scalar* in_re, + scalar* in_im, + scalar* out_re, + scalar* out_im, + const std::vector& dependencies); // BUFFER version //In-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout); +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout); //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout_re, - sycl::buffer, 1> &inout_im); +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout_re, + 
sycl::buffer, 1>& inout_im); //Out-of-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in, - sycl::buffer, 1> &out); +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in, + sycl::buffer, 1>& out); //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in_re, - sycl::buffer, 1> &in_im, - sycl::buffer, 1> &out_re, - sycl::buffer, 1> &out_im); +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in_re, + sycl::buffer, 1>& in_im, + sycl::buffer, 1>& out_re, + sycl::buffer, 1>& out_im); //USM version //In-place transform template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies); //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar *inout_re, - scalar *inout_im, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, scalar* inout_re, + scalar* inout_im, + const std::vector& dependencies); //Out-of-place transform template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd *in, - fwd *out, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, bwd* in, + fwd* out, + const std::vector& dependencies); //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar *in_re, - scalar *in_im, - scalar *out_re, - scalar *out_im, - const std::vector &dependencies); +ONEMKL_EXPORT sycl::event 
compute_backward(descriptor_type& desc, scalar* in_re, + scalar* in_im, + scalar* out_re, + scalar* out_im, + const std::vector& dependencies); diff --git a/include/oneapi/mkl/dft/detail/types_impl.hpp b/include/oneapi/mkl/dft/detail/types_impl.hpp index 60eb922ab..5dad2302e 100644 --- a/include/oneapi/mkl/dft/detail/types_impl.hpp +++ b/include/oneapi/mkl/dft/detail/types_impl.hpp @@ -113,7 +113,7 @@ using valid_compute_arg = typename std::bool_constant< template constexpr bool valid_ip_realreal_impl = - is_complex_dft&& std::is_same_v, data_t>; + is_complex_dft && std::is_same_v, data_t>; // compute the range of a reinterpreted buffer template diff --git a/include/oneapi/mkl/dft/forward.hpp b/include/oneapi/mkl/dft/forward.hpp index e43c39ce0..0eeecd497 100644 --- a/include/oneapi/mkl/dft/forward.hpp +++ b/include/oneapi/mkl/dft/forward.hpp @@ -34,7 +34,7 @@ namespace oneapi::mkl::dft { //In-place transform template -void compute_forward(descriptor_type &desc, sycl::buffer &inout) { +void compute_forward(descriptor_type& desc, sycl::buffer& inout) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); @@ -47,8 +47,8 @@ void compute_forward(descriptor_type &desc, sycl::buffer &inout) { //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template , bool> = true> -void compute_forward(descriptor_type &desc, sycl::buffer &inout_re, - sycl::buffer &inout_im) { +void compute_forward(descriptor_type& desc, sycl::buffer& inout_re, + sycl::buffer& inout_im) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); @@ -62,8 +62,8 @@ void compute_forward(descriptor_type &desc, sycl::buffer &inout_re //Out-of-place transform template -void compute_forward(descriptor_type &desc, sycl::buffer &in, - sycl::buffer &out) { +void compute_forward(descriptor_type& desc, sycl::buffer& in, + sycl::buffer& out) { static_assert(detail::valid_compute_arg::value, "unexpected type for 
input_type"); static_assert(detail::valid_compute_arg::value, @@ -80,9 +80,9 @@ void compute_forward(descriptor_type &desc, sycl::buffer &in, //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -void compute_forward(descriptor_type &desc, sycl::buffer &in_re, - sycl::buffer &in_im, sycl::buffer &out_re, - sycl::buffer &out_im) { +void compute_forward(descriptor_type& desc, sycl::buffer& in_re, + sycl::buffer& in_im, sycl::buffer& out_re, + sycl::buffer& out_im) { static_assert(detail::valid_compute_arg::value, "unexpected type for input_type"); static_assert(detail::valid_compute_arg::value, @@ -105,56 +105,56 @@ void compute_forward(descriptor_type &desc, sycl::buffer &in_re, //In-place transform template -sycl::event compute_forward(descriptor_type &desc, data_type *inout, - const std::vector &dependencies = {}) { +sycl::event compute_forward(descriptor_type& desc, data_type* inout, + const std::vector& dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); using fwd_type = typename detail::descriptor_info::forward_type; - return get_commit(desc)->forward_ip_cc(desc, reinterpret_cast(inout), dependencies); + return get_commit(desc)->forward_ip_cc(desc, reinterpret_cast(inout), dependencies); } //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template , bool> = true> -sycl::event compute_forward(descriptor_type &desc, data_type *inout_re, data_type *inout_im, - const std::vector &dependencies = {}) { +sycl::event compute_forward(descriptor_type& desc, data_type* inout_re, data_type* inout_im, + const std::vector& dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for data_type"); using scalar_type = typename detail::descriptor_info::scalar_type; - return get_commit(desc)->forward_ip_rr(desc, reinterpret_cast(inout_re), - reinterpret_cast(inout_im), dependencies); + return 
get_commit(desc)->forward_ip_rr(desc, reinterpret_cast(inout_re), + reinterpret_cast(inout_im), dependencies); } //Out-of-place transform template -sycl::event compute_forward(descriptor_type &desc, input_type *in, output_type *out, - const std::vector &dependencies = {}) { +sycl::event compute_forward(descriptor_type& desc, input_type* in, output_type* out, + const std::vector& dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for input_type"); static_assert(detail::valid_compute_arg::value, "unexpected type for output_type"); using fwd_type = typename detail::descriptor_info::forward_type; using bwd_type = typename detail::descriptor_info::backward_type; - return get_commit(desc)->forward_op_cc(desc, reinterpret_cast(in), - reinterpret_cast(out), dependencies); + return get_commit(desc)->forward_op_cc(desc, reinterpret_cast(in), + reinterpret_cast(out), dependencies); } //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -sycl::event compute_forward(descriptor_type &desc, input_type *in_re, input_type *in_im, - output_type *out_re, output_type *out_im, - const std::vector &dependencies = {}) { +sycl::event compute_forward(descriptor_type& desc, input_type* in_re, input_type* in_im, + output_type* out_re, output_type* out_im, + const std::vector& dependencies = {}) { static_assert(detail::valid_compute_arg::value, "unexpected type for input_type"); static_assert(detail::valid_compute_arg::value, "unexpected type for output_type"); using scalar_type = typename detail::descriptor_info::scalar_type; - return get_commit(desc)->forward_op_rr(desc, reinterpret_cast(in_re), - reinterpret_cast(in_im), - reinterpret_cast(out_re), - reinterpret_cast(out_im), dependencies); + return get_commit(desc)->forward_op_rr(desc, reinterpret_cast(in_re), + reinterpret_cast(in_im), + reinterpret_cast(out_re), + reinterpret_cast(out_im), dependencies); } } // namespace oneapi::mkl::dft diff --git 
a/include/oneapi/mkl/exceptions.hpp b/include/oneapi/mkl/exceptions.hpp index 244c8c61d..8047f7676 100644 --- a/include/oneapi/mkl/exceptions.hpp +++ b/include/oneapi/mkl/exceptions.hpp @@ -38,7 +38,7 @@ class exception : public std::exception { std::string msg_; public: - exception(const std::string &domain, const std::string &function, const std::string &info = "") + exception(const std::string& domain, const std::string& function, const std::string& info = "") : std::exception() { msg_ = std::string("oneMKL: ") + domain + ((domain.length() != 0 && function.length() != 0) ? "/" : "") + function + @@ -47,15 +47,15 @@ class exception : public std::exception { : ""); } - const char *what() const noexcept override { + const char* what() const noexcept override { return msg_.c_str(); } }; class unsupported_device : public oneapi::mkl::exception { public: - unsupported_device(const std::string &domain, const std::string &function, - const sycl::device &device) + unsupported_device(const std::string& domain, const std::string& function, + const sycl::device& device) : oneapi::mkl::exception( domain, function, device.get_info() + " is not supported") {} @@ -63,14 +63,14 @@ class unsupported_device : public oneapi::mkl::exception { class host_bad_alloc : public oneapi::mkl::exception { public: - host_bad_alloc(const std::string &domain, const std::string &function) + host_bad_alloc(const std::string& domain, const std::string& function) : oneapi::mkl::exception(domain, function, "cannot allocate memory on host") {} }; class device_bad_alloc : public oneapi::mkl::exception { public: - device_bad_alloc(const std::string &domain, const std::string &function, - const sycl::device &device) + device_bad_alloc(const std::string& domain, const std::string& function, + const sycl::device& device) : oneapi::mkl::exception( domain, function, "cannot allocate memory on " + device.get_info()) {} @@ -78,30 +78,30 @@ class device_bad_alloc : public oneapi::mkl::exception { class 
unimplemented : public oneapi::mkl::exception { public: - unimplemented(const std::string &domain, const std::string &function, - const std::string &info = "") + unimplemented(const std::string& domain, const std::string& function, + const std::string& info = "") : oneapi::mkl::exception(domain, function, "function is not implemented " + info) {} }; class invalid_argument : public oneapi::mkl::exception { public: - invalid_argument(const std::string &domain, const std::string &function, - const std::string &info = "") + invalid_argument(const std::string& domain, const std::string& function, + const std::string& info = "") : oneapi::mkl::exception(domain, function, "invalid argument " + info) {} }; class uninitialized : public oneapi::mkl::exception { public: - uninitialized(const std::string &domain, const std::string &function, - const std::string &info = "") + uninitialized(const std::string& domain, const std::string& function, + const std::string& info = "") : oneapi::mkl::exception(domain, function, "handle/descriptor is not initialized " + info) {} }; class computation_error : public oneapi::mkl::exception { public: - computation_error(const std::string &domain, const std::string &function, - const std::string &info = "") + computation_error(const std::string& domain, const std::string& function, + const std::string& info = "") : oneapi::mkl::exception( domain, function, "computation error" + ((info.length() != 0) ? (": " + info) : "")) {} @@ -109,16 +109,16 @@ class computation_error : public oneapi::mkl::exception { class batch_error : public oneapi::mkl::exception { public: - batch_error(const std::string &domain, const std::string &function, - const std::string &info = "") + batch_error(const std::string& domain, const std::string& function, + const std::string& info = "") : oneapi::mkl::exception(domain, function, "batch error" + ((info.length() != 0) ? 
(": " + info) : "")) {} }; class library_not_found : public oneapi::mkl::exception { public: - library_not_found(const std::string &domain, const std::string &function, - const std::string &info = "") + library_not_found(const std::string& domain, const std::string& function, + const std::string& info = "") : oneapi::mkl::exception( domain, function, "library not found" + ((info.length() != 0) ? (": " + info) : "")) {} diff --git a/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx b/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx index cd1d76765..0b1d58ba1 100644 --- a/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx +++ b/include/oneapi/mkl/lapack/detail/cusolver/lapack_ct.hxx @@ -20,760 +20,760 @@ // Buffer APIs static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - 
sycl::buffer &d, sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - 
sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void 
getri(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer 
&a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, - std::int64_t ldvt, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, + std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, - std::int64_t ldvt, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, + std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + std::int64_t ldu, sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, - std::int64_t ldvt, 
sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + std::int64_t ldu, sycl::buffer>& vt, + std::int64_t ldvt, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } static inline void heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } static inline void hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } static inline void hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, 
sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } static inline void hetrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void hetrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void hetrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void hetrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + 
sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void orgbr(backend_selector selector, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgbr(backend_selector selector, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, 
std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, 
sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, 
m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t 
lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, 
sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void syevd(backend_selector selector, 
oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } static inline void syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &w, sycl::buffer &scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } static inline void sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } static inline void sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, 
scratchpad, scratchpad_size); } static inline void sytrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void sytrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - 
std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, 
oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void 
ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t 
ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - 
std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, 
std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, @@ -781,10 +781,10 @@ static inline void getri_batch(backend_selector selector, std } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, 
std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, @@ -792,10 +792,10 @@ static inline void getrs_batch(backend_selector selector, } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, @@ -803,11 +803,11 @@ static inline void getrs_batch(backend_selector selector, } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> 
&scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, @@ -815,126 +815,126 @@ static inline void getrs_batch(backend_selector selector, } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, + sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, + sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void orgqr_batch(backend_selector selector, 
std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } 
static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + 
std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, @@ -942,31 +942,31 @@ static inline void potrs_batch(backend_selector selector, one } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } static inline void ungqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void ungqr_batch(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t 
scratchpad_size) { oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); @@ -975,350 +975,350 @@ static inline void ungqr_batch(backend_selector selector, std // USM APIs static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, float *d, - float *e, std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, float* d, + float* e, std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *d, double *e, - double *tauq, double *taup, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* d, double* e, + double* tauq, double* taup, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *d, float *e, - float *tauq, float *taup, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* d, float* e, + float* tauq, float* taup, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, 
taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *d, double *e, std::complex *tauq, - std::complex *taup, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + double* d, double* e, std::complex* tauq, + std::complex* taup, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies = {}) { + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, double* a, std::int64_t lda, std::int64_t* ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, 
std::int64_t *ipiv, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, float* a, std::int64_t lda, std::int64_t* ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - float *a, 
std::int64_t lda, std::int64_t *ipiv, float *scratchpad, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t *ipiv, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, std::int64_t* ipiv, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& 
dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t *ipiv, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, std::int64_t* ipiv, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *s, double *u, - std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* s, double* u, + std::int64_t ldu, double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *s, float *u, - std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* s, float* u, + std::int64_t ldu, float* vt, std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *s, std::complex *u, std::int64_t ldu, - std::complex *vt, std::int64_t ldvt, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, + double* s, 
std::complex* u, std::int64_t ldu, + std::complex* vt, std::int64_t ldvt, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } static inline sycl::event heevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, float *w, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event heevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, double *w, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, float *w, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, double *w, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, float *d, float *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, float* d, float* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, double *d, double *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, double* d, double* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, 
tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, - double *scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } 
static inline sycl::event orgtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1326,233 +1326,233 @@ static inline sycl::event ormtr(backend_selector selector, static inline sycl::event ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormrq(backend_selector 
selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, 
ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, 
+ std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + 
oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, 
scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *w, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* w, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *w, float *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* w, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *b, std::int64_t ldb, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* b, std::int64_t ldb, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *b, std::int64_t ldb, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, float* b, std::int64_t ldb, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *d, double *e, double *tau, double *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* d, double* e, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = 
{}) { return oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *d, float *e, float *tau, float *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* d, float* e, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); @@ -1560,19 +1560,19 @@ static inline sycl::event trtrs(backend_selector selector, static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); @@ -1580,71 +1580,71 @@ static inline sycl::event trtrs(backend_selector selector, static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + 
std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungqr(selector.get_queue(), m, n, k, a, lda, 
tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1652,11 +1652,11 @@ static inline sycl::event 
unmrq(backend_selector selector, static inline sycl::event unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1664,10 +1664,10 @@ static inline sycl::event unmrq(backend_selector selector, static inline sycl::event unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1675,11 +1675,11 @@ static inline sycl::event unmqr(backend_selector selector, static inline sycl::event unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1687,10 +1687,10 @@ static inline sycl::event unmqr(backend_selector selector, static inline sycl::event unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1698,592 +1698,592 @@ static inline sycl::event unmtr(backend_selector selector, static inline sycl::event unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t 
stride_tau, - std::int64_t batch_size, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, 
std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, group_count, 
group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* 
scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + 
std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +static inline sycl::event getrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +static inline sycl::event getrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t 
*group_sizes, std::complex *scratchpad, +static inline sycl::event getrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event getrf_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, float *scratchpad, + float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline 
sycl::event getri_batch(backend_selector selector, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, double *scratchpad, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event 
getri_batch(backend_selector selector, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri_batch(backend_selector selector, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(backend_selector selector, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri_batch(backend_selector selector, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(backend_selector selector, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event getri_batch(backend_selector selector, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) 
{ return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(backend_selector selector, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event getri_batch(backend_selector selector, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, 
+ std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch( backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}) { + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch( backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}) { + std::int64_t nrhs, std::complex* a, std::int64_t 
lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, - std::int64_t **ipiv, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, + std::int64_t** ipiv, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, - std::int64_t **ipiv, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, + std::int64_t** ipiv, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch( - backend_selector selector, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + backend_selector selector, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static 
inline sycl::event orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies 
= {}) { return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::int64_t* k, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - 
oneapi::mkl::uplo *uplo, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event 
potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t stride_a, float *b, + float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t stride_a, double *b, + double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, 
stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - float **a, std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - double **a, std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + double** a, 
std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, 
dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, + std::int64_t n, std::int64_t k, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, + std::int64_t n, std::int64_t k, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* 
group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, +static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::cusolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); @@ -2566,62 +2566,62 @@ std::int64_t ungqr_batch_scratchpad_size(backend_selector sel } template std::int64_t getrf_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::getrf_batch_scratchpad_size( selector.get_queue(), m, n, lda, group_count, group_sizes); } template std::int64_t getri_batch_scratchpad_size(backend_selector selector, - std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::getri_batch_scratchpad_size( selector.get_queue(), n, lda, group_count, group_sizes); } template 
std::int64_t getrs_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::getrs_batch_scratchpad_size( selector.get_queue(), trans, n, nrhs, lda, ldb, group_count, group_sizes); } template std::int64_t geqrf_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::geqrf_batch_scratchpad_size( selector.get_queue(), m, n, lda, group_count, group_sizes); } template std::int64_t orgqr_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::orgqr_batch_scratchpad_size( selector.get_queue(), m, n, k, lda, group_count, group_sizes); } template std::int64_t potrf_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::potrf_batch_scratchpad_size( selector.get_queue(), uplo, n, lda, group_count, group_sizes); } template std::int64_t potrs_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::uplo 
*uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::potrs_batch_scratchpad_size( selector.get_queue(), uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template std::int64_t ungqr_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::cusolver::ungqr_batch_scratchpad_size( selector.get_queue(), m, n, k, lda, group_count, group_sizes); } diff --git a/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx b/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx index ffa9c3007..f0de843fe 100644 --- a/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx +++ b/include/oneapi/mkl/lapack/detail/cusolver/onemkl_lapack_cusolver.hxx @@ -19,1812 +19,1812 @@ // Buffer APIs -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, 
+ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t 
scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t 
lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + 
sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& 
ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - 
sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + 
sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +ONEMKL_EXPORT void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +ONEMKL_EXPORT void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo 
uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& 
scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - 
sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose 
trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, 
+ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, 
std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void 
sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); 
-ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void 
ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + 
sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& 
scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, 
std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + 
std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer 
&scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& 
scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& 
scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, 
std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); 
-ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& 
scratchpad, std::int64_t scratchpad_size); // USM APIs -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *d, double *e, double *tauq, double *taup, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *d, float *e, float *tauq, float *taup, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* d, double* e, double* tauq, double* taup, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* d, float* e, float* tauq, float* taup, + 
float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT 
sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event 
getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); 
-ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, - double *b, std::int64_t ldb, double *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, 
std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t* ipiv, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, - float *b, std::int64_t ldb, float *scratchpad, +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t* ipiv, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu, - double *vt, std::int64_t ldvt, double *scratchpad, + double* a, std::int64_t lda, double* 
s, double* u, std::int64_t ldu, + double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt, - std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* s, float* u, std::int64_t ldu, float* vt, + std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, double* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* 
scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - double *w, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - float *w, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - double *w, std::complex 
*scratchpad, +ONEMKL_EXPORT sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, 
std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, 
std::int64_t k, - float *a, std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *scratchpad, +ONEMKL_EXPORT sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, 
oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, double* a, std::int64_t lda, double* 
tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue 
&queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - 
std::int64_t nrhs, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *w, double *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT 
sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t 
itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *w, float *scratchpad, +ONEMKL_EXPORT sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *d, double *e, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* d, double* e, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *d, float *e, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* d, float* e, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, 
oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies 
= {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *scratchpad, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *scratchpad, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, 
- std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, 
std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT 
sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, std::int64_t stride_tau, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t 
lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event 
geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, 
std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - 
const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, 
+ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event 
getri_batch(sycl::queue &queue, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, 
std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, 
std::complex *b, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, float **a, - std::int64_t *lda, std::int64_t **ipiv, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, float** a, + std::int64_t* lda, std::int64_t** ipiv, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT 
sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, double **a, - std::int64_t *lda, std::int64_t **ipiv, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, double** a, + std::int64_t* lda, std::int64_t** ipiv, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + 
std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT 
sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT 
sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, 
std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event 
ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); // SCRATCHPAD APIs template -ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template -ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t 
getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT std::int64_t heevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, +ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT 
std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, 
std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, +ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, 
+ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, 
std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, 
std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, 
std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); diff --git a/include/oneapi/mkl/lapack/detail/lapack_loader.hpp b/include/oneapi/mkl/lapack/detail/lapack_loader.hpp index 2bb49364e..fc5b3d70e 100644 --- a/include/oneapi/mkl/lapack/detail/lapack_loader.hpp +++ b/include/oneapi/mkl/lapack/detail/lapack_loader.hpp @@ -38,2344 +38,2344 @@ namespace mkl { namespace lapack { namespace detail { -ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> 
&scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - 
sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void 
gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void 
geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void 
getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t 
nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, - std::int64_t ldvt, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, + std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, - std::int64_t ldvt, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, + std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + 
std::int64_t ldu, sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, - std::int64_t ldvt, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + std::int64_t ldu, sycl::buffer>& vt, + std::int64_t ldvt, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT void heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT void heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT void hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT void hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - 
std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + 
sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, 
std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, 
oneapi::mkl::side side, + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void 
potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, 
- oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, 
sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sygvd(oneapi::mkl::device 
libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sygvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> 
&scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); 
+ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, 
+ sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, +ONEMKL_EXPORT void ungbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, +ONEMKL_EXPORT void ungbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + 
sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT void unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT void unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT void unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT void unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT void 
unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t 
stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, +ONEMKL_EXPORT void geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, 
std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + 
sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, 
std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, + sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer> &a, +ONEMKL_EXPORT void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, + sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, +ONEMKL_EXPORT void orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t 
stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - 
sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &b, + 
sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +ONEMKL_EXPORT void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, +ONEMKL_EXPORT void ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT void ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, float 
*d, - float *e, std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *d, double *e, - double *tauq, double *taup, double *scratchpad, +ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, float* d, + float* e, std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double* a, std::int64_t lda, double* d, double* e, + double* tauq, double* taup, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *d, float *e, - float *tauq, float *taup, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float* a, std::int64_t lda, float* d, float* e, + float* tauq, float* taup, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *d, double *e, std::complex *tauq, - std::complex *taup, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + double* d, double* e, std::complex* tauq, + std::complex* taup, 
std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + 
std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t 
lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double* a, std::int64_t lda, std::int64_t* ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float* a, std::int64_t lda, std::int64_t* ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + 
std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(oneapi::mkl::device 
libkey, sycl::queue& queue, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t *ipiv, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, + double* a, std::int64_t lda, std::int64_t* ipiv, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t *ipiv, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, + float* a, std::int64_t lda, std::int64_t* ipiv, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event 
getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *s, double *u, - std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* s, double* u, + std::int64_t ldu, double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *s, float *u, - std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* s, float* u, + std::int64_t ldu, float* vt, std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, 
std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *s, std::complex *u, std::int64_t ldu, - std::complex *vt, std::int64_t ldvt, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue, + std::int64_t n, std::complex* a, std::int64_t lda, + double* s, std::complex* u, std::int64_t ldu, + std::complex* vt, std::int64_t ldvt, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue, + std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, + std::complex* a, std::int64_t lda, double* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, float *w, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, double *w, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, float *d, float *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, double *d, double *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrf(oneapi::mkl::device libkey, 
sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, float* d, float* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, double* d, double* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, - float *scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue, + std::int64_t k, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue, + std::int64_t k, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t 
k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue, + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue, + double* a, std::int64_t lda, double* tau, double* 
c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t 
scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - 
const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT 
sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* 
scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syevd(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *w, double *scratchpad, + double* a, std::int64_t lda, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syevd(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *w, float *scratchpad, + float* a, std::int64_t lda, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *b, std::int64_t ldb, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = 
{}); -ONEMKL_EXPORT sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, + double* a, std::int64_t lda, double* b, std::int64_t ldb, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *b, std::int64_t ldb, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *d, double *e, double *tau, double *scratchpad, + float* a, std::int64_t lda, float* b, std::int64_t ldb, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* d, double* e, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *d, float *e, float *tau, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* d, float* e, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + 
const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t 
nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue, - 
oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, 
- std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, 
std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex *a, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex *a, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::complex** tau, + 
std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT 
sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); 
-ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t n, float* a, std::int64_t lda, + std::int64_t 
stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - 
std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event 
getrs_batch( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event getrs_batch( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, - std::int64_t **ipiv, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, - std::int64_t **ipiv, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, + std::int64_t** ipiv, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, + std::int64_t** ipiv, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, 
std::int64_t scratchpad_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event getrs_batch( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, std::int64_t stride_tau, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - 
const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo uplo, 
std::int64_t n, double *a, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - 
const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const 
std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t stride_a, float *b, + float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t stride_a, double *b, + double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, 
oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - float **a, std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - double **a, std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + 
oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + double** a, std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* 
scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); ONEMKL_EXPORT sycl::event ungqr_batch( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::int64_t stride_a, std::complex *tau, - std::int64_t stride_tau, std::int64_t batch_size, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex* tau, + std::int64_t stride_tau, std::int64_t batch_size, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, 
std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); template = nullptr> -std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template = nullptr> -std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template = nullptr> -std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, +std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, 
sycl::queue& queue, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t heevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t heevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t hegvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t hegvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t hetrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t hetrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t hetrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t hetrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t 
orgqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t 
lda); template = nullptr> -std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t ungbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ungbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ungqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, +std::int64_t ungqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> 
-std::int64_t ungtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ungtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t unmrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t unmrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t unmqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t unmqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t unmtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t unmtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template = nullptr> -std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template = nullptr> -std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t 
getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template = nullptr> -std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ungqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t 
getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t 
orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t ungqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, 
std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t 
gesvd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t getrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t getrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t 
getri_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t n, + sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t n, + sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t getri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t getri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t getrs_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t getrs_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t heevd_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, 
oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t heevd_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t 
lda); template <> ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> 
ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t potrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t potrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT 
std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t potrs_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t potrs_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t potri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t potri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + 
sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::job jobz, + sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::job jobz, + sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t itype, + sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t itype, + sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(oneapi::mkl::device 
libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device 
libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, 
std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, 
std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, 
std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT 
std::int64_t geqrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t 
potrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device libkey, 
sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t 
getri_batch_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, + sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, + sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, 
std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t 
geqrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> 
ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& 
queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes); + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes); } //namespace detail } 
//namespace lapack } //namespace mkl diff --git a/include/oneapi/mkl/lapack/detail/lapack_rt.hpp b/include/oneapi/mkl/lapack/detail/lapack_rt.hpp index a96efe8d1..5199a0ce5 100644 --- a/include/oneapi/mkl/lapack/detail/lapack_rt.hpp +++ b/include/oneapi/mkl/lapack/detail/lapack_rt.hpp @@ -38,2132 +38,2132 @@ namespace oneapi { namespace mkl { namespace lapack { -static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, +static inline void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, +static inline void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& 
scratchpad, std::int64_t scratchpad_size) { detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -static inline void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, +static inline void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + 
sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { 
detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +static inline void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getrf(sycl::queue &queue, 
std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& 
ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +static inline void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, +static inline void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, 
scratchpad, scratchpad_size); } -static inline void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +static inline void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, +static inline void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -static inline void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, +static inline void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -static inline void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, +static inline void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -static inline void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, +static inline void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -static inline void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +static inline void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -static inline void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +static inline void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -static inline void hetrd(sycl::queue &queue, oneapi::mkl::uplo 
uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -static inline void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -static inline void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, 
scratchpad_size); } -static inline void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, +static inline void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static inline void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, +static inline void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static inline void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static inline void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static inline void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, 
oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +static inline void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +static inline void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, 
std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +static inline void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +static inline void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +static inline void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, 
std::int64_t scratchpad_size) { detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +static inline void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +static inline void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +static inline void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - 
sycl::buffer> &scratchpad, +static inline void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +static inline void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +static inline void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, 
std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, +static inline void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, +static inline void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +static inline void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -static inline void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +static inline void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, 
scratchpad, scratchpad_size); } -static inline void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, +static inline void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -static inline void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, +static inline void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -static inline void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -static inline void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer 
&a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +static inline void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void sytrf(sycl::queue &queue, oneapi::mkl::uplo 
uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +static inline void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +static inline void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +static inline void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +static inline void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer 
&scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +static inline void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -static inline void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static inline void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static 
inline void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static inline void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -static inline void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +static inline void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -static inline void unmrq(sycl::queue &queue, 
oneapi::mkl::side side, oneapi::mkl::transpose trans, +static inline void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +static inline void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +static inline void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::unmqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void unmqr(sycl::queue &queue, oneapi::mkl::side side, 
oneapi::mkl::transpose trans, +static inline void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::unmqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, 
std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +static inline void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +static inline void 
geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +static inline void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +static inline void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +static inline void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, 
std::int64_t scratchpad_size) { detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +static inline void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, +static inline void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, +static inline void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, 
std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +static inline void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -static inline void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +static inline void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, 
batch_size, scratchpad, scratchpad_size); } -static inline void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +static inline void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void 
getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +static inline void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -static inline void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline void 
potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -static inline void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +static inline void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -static inline void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +static inline void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -static inline void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +static inline void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, 
std::int64_t scratchpad_size) { detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +static inline void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +static inline void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +static inline void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t 
stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -static inline void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +static inline void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -static inline void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +static inline void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +static inline void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - 
sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *d, double *e, double *tauq, double *taup, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* d, double* e, double* tauq, double* taup, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *d, float *e, float *tauq, float *taup, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* d, float* e, float* tauq, float* taup, + float* 
scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::gebrd(get_device_id(queue), queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, +static inline sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, +static inline sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event 
gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +static inline sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::gerqf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, +static inline sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, 
double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, +static inline sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +static inline sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::geqrf(get_device_id(queue), queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf(sycl::queue 
&queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, +static inline sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, +static inline sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrf(get_device_id(queue), queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + 
std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, +static inline sycl::event getri(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri(get_device_id(queue), queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, 
std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, - double *b, std::int64_t ldb, double *scratchpad, +static inline sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t* ipiv, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, - float *b, std::int64_t ldb, float *scratchpad, +static inline sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t* ipiv, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs(sycl::queue 
&queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrs(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +static inline sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu, - double *vt, std::int64_t ldvt, double *scratchpad, + double* a, std::int64_t lda, double* s, double* u, std::int64_t ldu, + double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt, - std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* s, float* u, std::int64_t ldu, float* vt, + 
std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +static inline sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +static inline sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, double* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::gesvd(get_device_id(queue), queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - double *w, std::complex *scratchpad, +static inline sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::heevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - float *w, std::complex *scratchpad, +static inline sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, 
std::complex *b, std::int64_t ldb, - double *w, std::complex *scratchpad, +static inline sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::hegvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, +static inline sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, +static inline sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::hetrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - 
std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::hetrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) 
{ +static inline sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::orgbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, double *tau, double *scratchpad, +static inline sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, float *tau, float *scratchpad, +static inline sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::orgqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *scratchpad, +static inline sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, 
tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *scratchpad, +static inline sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::orgtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ormtr(get_device_id(queue), queue, side, uplo, trans, m, 
n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ormrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, 
double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ormqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, +static inline sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, +static inline sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event 
potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrf(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, +static inline sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, +static inline sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const 
std::vector& dependencies = {}) { return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potri(get_device_id(queue), queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs(sycl::queue &queue, 
oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrs(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, 
dependencies); } -static inline sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::syevd(get_device_id(queue), queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *w, double *scratchpad, +static inline sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event 
sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *w, float *scratchpad, +static inline sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::sygvd(get_device_id(queue), queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *d, double *e, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* d, double* e, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *d, float *e, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* d, float* e, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::sytrd(get_device_id(queue), queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo 
uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, +static inline sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, +static inline sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, 
std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::sytrf(get_device_id(queue), queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +static inline sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +static inline sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *scratchpad, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +static inline sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, 
float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *scratchpad, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +static inline sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::trtrs(get_device_id(queue), queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t 
m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ungbr(get_device_id(queue), queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +static inline sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::ungqr(get_device_id(queue), queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +static inline sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::ungtr(get_device_id(queue), queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::unmrq(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::unmqr(get_device_id(queue), queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, +static inline sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::unmqr(get_device_id(queue), queue, side, trans, 
m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +static inline sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::unmtr(get_device_id(queue), queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, +static inline sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, std::int64_t stride_tau, std::int64_t batch_size, - float 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, +static inline sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +static inline sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +static inline sycl::event 
geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event 
geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::geqrf_batch(get_device_id(queue), queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { 
return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t 
stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, 
std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrf_batch(get_device_id(queue), queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } 
-static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, 
lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* 
scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, +static inline sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getri_batch(get_device_id(queue), queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, +static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, double *a, - 
std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, +static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, +static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, 
+static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, float **a, - std::int64_t *lda, std::int64_t **ipiv, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, float** a, + std::int64_t* lda, std::int64_t** ipiv, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, double **a, - std::int64_t *lda, std::int64_t **ipiv, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, double** a, + 
std::int64_t* lda, std::int64_t** ipiv, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, +static inline sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::getrs_batch(get_device_id(queue), queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, +static inline sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, +static inline sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t 
*group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::orgqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event 
potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo 
*uplo, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, 
dependencies); } -static inline sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, +static inline sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrf_batch(get_device_id(queue), queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event 
potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, 
std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::potrs_batch(get_device_id(queue), queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +static inline sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, 
batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +static inline sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -static inline sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** 
tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return detail::ungqr_batch(get_device_id(queue), queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } template = nullptr> -std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return detail::gebrd_scratchpad_size(get_device_id(queue), queue, m, n, lda); } template = nullptr> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return detail::gerqf_scratchpad_size(get_device_id(queue), queue, m, n, lda); } template = nullptr> -std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return detail::geqrf_scratchpad_size(get_device_id(queue), queue, m, n, lda); } template = nullptr> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) { return detail::gesvd_scratchpad_size(get_device_id(queue), queue, jobu, jobvt, m, n, lda, ldu, ldvt); } template = nullptr> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) { return 
detail::gesvd_scratchpad_size(get_device_id(queue), queue, jobu, jobvt, m, n, lda, ldu, ldvt); } template = nullptr> -std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return detail::getrf_scratchpad_size(get_device_id(queue), queue, m, n, lda); } template = nullptr> -std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda) { +std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda) { return detail::getri_scratchpad_size(get_device_id(queue), queue, n, lda); } template = nullptr> -std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, +std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return detail::getrs_scratchpad_size(get_device_id(queue), queue, trans, n, nrhs, lda, ldb); } template = nullptr> -std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t heevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::heevd_scratchpad_size(get_device_id(queue), queue, jobz, uplo, n, lda); } template = nullptr> -std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +std::int64_t hegvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { return detail::hegvd_scratchpad_size(get_device_id(queue), queue, itype, jobz, uplo, n, lda, ldb); } template = nullptr> -std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t hetrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t 
lda) { return detail::hetrd_scratchpad_size(get_device_id(queue), queue, uplo, n, lda); } template = nullptr> -std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t hetrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::hetrf_scratchpad_size(get_device_id(queue), queue, uplo, n, lda); } template = nullptr> -std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, +std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return detail::orgbr_scratchpad_size(get_device_id(queue), queue, vect, m, n, k, lda); } template = nullptr> -std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::orgtr_scratchpad_size(get_device_id(queue), queue, uplo, n, lda); } template = nullptr> -std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return detail::orgqr_scratchpad_size(get_device_id(queue), queue, m, n, k, lda); } template = nullptr> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { return detail::ormrq_scratchpad_size(get_device_id(queue), queue, side, trans, m, n, k, lda, ldc); } template = nullptr> -std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose 
trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { return detail::ormqr_scratchpad_size(get_device_id(queue), queue, side, trans, m, n, k, @@ -2171,7 +2171,7 @@ std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, } template = nullptr> -std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) { @@ -2179,45 +2179,45 @@ std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, n, lda, ldc); } template = nullptr> -std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::potrf_scratchpad_size(get_device_id(queue), queue, uplo, n, lda); } template = nullptr> -std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return detail::potrs_scratchpad_size(get_device_id(queue), queue, uplo, n, nrhs, lda, ldb); } template = nullptr> -std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::potri_scratchpad_size(get_device_id(queue), queue, uplo, n, lda); } template = nullptr> -std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::sytrf_scratchpad_size(get_device_id(queue), queue, 
uplo, n, lda); } template = nullptr> -std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::syevd_scratchpad_size(get_device_id(queue), queue, jobz, uplo, n, lda); } template = nullptr> -std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { return detail::sygvd_scratchpad_size(get_device_id(queue), queue, itype, jobz, uplo, n, lda, ldb); } template = nullptr> -std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::sytrd_scratchpad_size(get_device_id(queue), queue, uplo, n, lda); } template = nullptr> -std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -2226,31 +2226,31 @@ std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, } template = nullptr> -std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, +std::int64_t ungbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return detail::ungbr_scratchpad_size(get_device_id(queue), queue, vect, m, n, k, lda); } template = nullptr> -std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t ungqr_scratchpad_size(sycl::queue& queue, 
std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return detail::ungqr_scratchpad_size(get_device_id(queue), queue, m, n, k, lda); } template = nullptr> -std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t ungtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return detail::ungtr_scratchpad_size(get_device_id(queue), queue, uplo, n, lda); } template = nullptr> -std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { return detail::unmrq_scratchpad_size(get_device_id(queue), queue, side, trans, m, n, k, lda, ldc); } template = nullptr> -std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { return detail::unmqr_scratchpad_size(get_device_id(queue), queue, side, trans, m, n, k, @@ -2258,7 +2258,7 @@ std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, } template = nullptr> -std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) { @@ -2266,21 +2266,21 @@ std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, n, lda, ldc); } template = nullptr> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, 
std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return detail::getrf_batch_scratchpad_size(get_device_id(queue), queue, m, n, lda, stride_a, stride_ipiv, batch_size); } template = nullptr> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda, +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return detail::getri_batch_scratchpad_size(get_device_id(queue), queue, n, lda, stride_a, stride_ipiv, batch_size); } template = nullptr> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, @@ -2290,21 +2290,21 @@ std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transp batch_size); } template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return detail::geqrf_batch_scratchpad_size(get_device_id(queue), queue, m, n, lda, stride_a, stride_tau, batch_size); } template = nullptr> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { return detail::potrf_batch_scratchpad_size(get_device_id(queue), queue, uplo, n, lda, stride_a, batch_size); } template = nullptr> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, 
std::int64_t n, +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { @@ -2313,7 +2313,7 @@ std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo u } template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return detail::orgqr_batch_scratchpad_size(get_device_id(queue), queue, m, n, k, lda, @@ -2321,68 +2321,68 @@ std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std } template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return detail::ungqr_batch_scratchpad_size(get_device_id(queue), queue, m, n, k, lda, stride_a, stride_tau, batch_size); } template = nullptr> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return detail::getrf_batch_scratchpad_size(get_device_id(queue), queue, m, n, lda, group_count, group_sizes); } template = nullptr> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { +std::int64_t 
getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return detail::getri_batch_scratchpad_size(get_device_id(queue), queue, n, lda, group_count, group_sizes); } template = nullptr> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return detail::getrs_batch_scratchpad_size(get_device_id(queue), queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return detail::geqrf_batch_scratchpad_size(get_device_id(queue), queue, m, n, lda, group_count, group_sizes); } template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return detail::orgqr_batch_scratchpad_size(get_device_id(queue), queue, m, n, k, lda, group_count, group_sizes); } template = nullptr> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, - 
std::int64_t group_count, std::int64_t *group_sizes) { +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return detail::potrf_batch_scratchpad_size(get_device_id(queue), queue, uplo, n, lda, group_count, group_sizes); } template = nullptr> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return detail::potrs_batch_scratchpad_size(get_device_id(queue), queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { +std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return detail::ungqr_batch_scratchpad_size(get_device_id(queue), queue, m, n, k, lda, group_count, group_sizes); } diff --git a/include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx b/include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx index 1ebe97527..f952eb3fd 100644 --- a/include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx +++ b/include/oneapi/mkl/lapack/detail/mkl_common/lapack_ct.hxx @@ -18,765 +18,765 @@ *******************************************************************************/ static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, 
sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + 
std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, 
std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static 
inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, 
std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, 
sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + std::int64_t ldu, sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, - std::int64_t ldvt, sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + std::int64_t ldu, sycl::buffer>& vt, + std::int64_t ldvt, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, 
scratchpad, scratchpad_size); } static inline void heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } static inline void hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } static inline void hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } static inline void hetrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void hetrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void hetrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void hetrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } 
static inline void orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void orgtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + 
std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static 
inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, 
scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { 
+ std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } static inline void syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, 
sycl::buffer &w, sycl::buffer &scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } static inline void sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } static inline void sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } static inline void sytrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void sytrd(backend_selector selector, 
oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, 
oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } static inline void ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, 
sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void ungtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size); } static inline void unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> 
&a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& 
c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, 
scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t 
stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, @@ -784,10 +784,10 @@ static inline void getri_batch(backend_selector selecto } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, 
std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, @@ -795,10 +795,10 @@ static inline void getrs_batch(backend_selector selecto } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, @@ -806,11 +806,11 @@ static inline void getrs_batch(backend_selector selecto } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, @@ -818,111 +818,111 @@ static inline void getrs_batch(backend_selector selecto } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - 
std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, + 
sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, + sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } static inline void orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } static inline void potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch( selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } static inline void potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch( selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch( selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch( 
selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, @@ -930,9 +930,9 @@ static inline void potrs_batch(backend_selector selecto } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, @@ -940,10 +940,10 @@ static inline void potrs_batch(backend_selector selecto } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, @@ -951,21 +951,21 @@ static inline void potrs_batch(backend_selector selecto } static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } static inline void ungqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, @@ -973,370 +973,370 @@ static inline void ungqr_batch(backend_selector selecto } static inline void ungqr_batch(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, 
scratchpad, scratchpad_size); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, float *d, - float *e, std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, float* d, + float* e, std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *d, double *e, - double *tauq, double *taup, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* d, double* e, + double* tauq, double* taup, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *d, float *e, - float *tauq, float *taup, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* d, float* e, + float* tauq, float* taup, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *d, double *e, std::complex 
*tauq, - std::complex *taup, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + double* d, double* e, std::complex* tauq, + std::complex* taup, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, 
scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, double* a, std::int64_t lda, std::int64_t* ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, float* a, std::int64_t lda, 
std::int64_t* ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t *ipiv, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, std::int64_t* ipiv, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, 
ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t *ipiv, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, std::int64_t* ipiv, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *s, double *u, - std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* s, double* u, + std::int64_t ldu, double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, 
scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *s, float *u, - std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* s, float* u, + std::int64_t ldu, float* vt, std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *s, std::complex *u, std::int64_t ldu, - std::complex *vt, std::int64_t ldvt, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, + double* s, std::complex* u, std::int64_t ldu, + std::complex* vt, std::int64_t ldvt, + std::complex* scratchpad, 
std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } static inline sycl::event heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, double* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, 
scratchpad, scratchpad_size, dependencies); } static inline sycl::event hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, double *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, double* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, float *d, float *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, float* d, float* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, double *d, double *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, double* d, double* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, 
scratchpad_size, dependencies); } static inline sycl::event hetrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgbr( selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, - double *scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgbr( selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, 
scratchpad_size, dependencies); } static inline sycl::event orgtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1344,243 +1344,243 @@ static inline sycl::event ormtr(backend_selector select static inline sycl::event ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } 
static inline sycl::event ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, 
std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline 
sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t 
scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event syevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *w, double *scratchpad, + double* a, std::int64_t lda, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event syevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - 
float *a, std::int64_t lda, float *w, float *scratchpad, + float* a, std::int64_t lda, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *w, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *w, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *d, double *e, double *tau, double *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* d, double* e, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const 
std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *d, float *e, float *tau, float *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* d, float* e, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, 
std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); @@ -1588,19 +1588,19 @@ static inline sycl::event trtrs(backend_selector select static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, 
std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); @@ -1608,71 +1608,71 @@ static inline sycl::event trtrs(backend_selector select static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - 
std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungbr( selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungbr( selector.get_queue(), vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const 
std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } static inline sycl::event unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, 
n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1680,11 +1680,11 @@ static inline sycl::event unmrq(backend_selector select static inline sycl::event unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1692,10 +1692,10 @@ static inline sycl::event unmrq(backend_selector select static inline sycl::event unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1703,11 +1703,11 @@ static inline sycl::event unmqr(backend_selector select static inline sycl::event unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, 
std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1715,10 +1715,10 @@ static inline sycl::event unmqr(backend_selector select static inline sycl::event unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); @@ -1726,607 +1726,607 @@ static inline sycl::event unmtr(backend_selector select static inline sycl::event unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector 
selector, - std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch( selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, - std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch( selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, - std::int64_t m, std::int64_t n, std::complex *a, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch( selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, 
scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, - std::int64_t m, std::int64_t n, std::complex *a, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch( selector.get_queue(), m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, 
group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const 
std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex 
*scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch( selector.get_queue(), m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch( selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t n, double *a, std::int64_t lda, - 
std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch( selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch( selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch( selector.get_queue(), n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t *n, float **a, 
std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, 
std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch(selector.get_queue(), n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const 
std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline 
sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, - std::int64_t **ipiv, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, + std::int64_t** ipiv, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, - std::int64_t **ipiv, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, + std::int64_t** ipiv, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t 
*group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch( - backend_selector selector, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + backend_selector selector, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr_batch(backend_selector selector, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, std::int64_t stride_tau, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + 
float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch( selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr_batch(backend_selector selector, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch( selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch( selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + std::int64_t* m, std::int64_t* n, std::int64_t* k, double** a, + 
std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch( selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, 
dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch(selector.get_queue(), uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t stride_a, float *b, + float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, 
lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t stride_a, double *b, + double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - float **a, std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - double **a, std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + double** a, std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, 
std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch( selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const 
std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch( selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch( selector.get_queue(), m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch( selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, + 
std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch( selector.get_queue(), m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); @@ -2631,64 +2631,64 @@ std::int64_t ungqr_batch_scratchpad_size(backend_selector = nullptr> std::int64_t getrf_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrf_batch_scratchpad_size( selector.get_queue(), m, n, lda, group_count, group_sizes); } template = nullptr> std::int64_t getri_batch_scratchpad_size(backend_selector selector, - std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::getri_batch_scratchpad_size( selector.get_queue(), n, lda, group_count, group_sizes); } template = nullptr> std::int64_t getrs_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::getrs_batch_scratchpad_size( selector.get_queue(), trans, n, nrhs, lda, ldb, group_count, group_sizes); } template = nullptr> std::int64_t geqrf_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, 
std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::geqrf_batch_scratchpad_size( selector.get_queue(), m, n, lda, group_count, group_sizes); } template = nullptr> std::int64_t orgqr_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::orgqr_batch_scratchpad_size( selector.get_queue(), m, n, k, lda, group_count, group_sizes); } template = nullptr> std::int64_t potrf_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrf_batch_scratchpad_size( selector.get_queue(), uplo, n, lda, group_count, group_sizes); } template = nullptr> std::int64_t potrs_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::potrs_batch_scratchpad_size( selector.get_queue(), uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template = nullptr> std::int64_t ungqr_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - 
std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::LAPACK_BACKEND::ungqr_batch_scratchpad_size( selector.get_queue(), m, n, k, lda, group_count, group_sizes); } diff --git a/include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx b/include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx index 372e2646b..bbc6079b7 100644 --- a/include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx +++ b/include/oneapi/mkl/lapack/detail/mkl_common/onemkl_lapack_backends.hxx @@ -17,2123 +17,2123 @@ * SPDX-License-Identifier: Apache-2.0 *******************************************************************************/ -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - 
sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, 
std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, - 
std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& 
d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, 
sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT 
void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, + 
std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +ONEMKL_EXPORT void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - 
sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& 
scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + 
sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); 
-ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, - std::int64_t 
scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); 
-ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormqr(sycl::queue& queue, oneapi::mkl::side side, 
oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t 
scratchpad_size); +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); 
+ONEMKL_EXPORT void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, 
sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); 
-ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); 
+ONEMKL_EXPORT void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& 
scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + 
sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, 
std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& 
a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t 
batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t 
stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, 
std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t 
scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *d, double *e, double *tauq, double *taup, - 
double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *d, float *e, float *tauq, float *taup, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* d, double* e, double* tauq, double* taup, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* d, float* e, float* tauq, float* taup, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* 
scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + 
std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* ipiv, double* 
scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - 
const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, - double *b, std::int64_t ldb, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t* ipiv, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float 
*a, std::int64_t lda, std::int64_t *ipiv, - float *b, std::int64_t ldb, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t* ipiv, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu, - double *vt, std::int64_t ldvt, double *scratchpad, + double* a, std::int64_t lda, double* s, double* u, std::int64_t ldu, + double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt, - std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue 
&queue, oneapi::mkl::jobsvd jobu, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* s, float* u, std::int64_t ldu, float* vt, + std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, double* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - double *w, std::complex *scratchpad, + 
const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - float *w, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - double *w, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, + const std::vector& 
dependencies = {}); +ONEMKL_EXPORT sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, double *tau, double *scratchpad, + const 
std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, float *tau, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo 
uplo, std::int64_t n, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, + double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t k, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t k, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, 
oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT 
sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, 
std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *w, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t 
n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *w, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *d, double *e, double *tau, 
- double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *d, float *e, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* d, double* e, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* d, float* e, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *scratchpad, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, + const std::vector& 
dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *scratchpad, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, 
std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungtr(sycl::queue& queue, 
oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t k, 
std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, 
std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, std::int64_t stride_tau, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, 
std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, 
std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t 
stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event 
getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, double* a, + std::int64_t lda, std::int64_t 
stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t 
**ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, float* a, 
std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t 
stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, float **a, - std::int64_t *lda, std::int64_t **ipiv, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, float** a, + std::int64_t* lda, std::int64_t** ipiv, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, double **a, - std::int64_t *lda, std::int64_t **ipiv, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, double** a, + std::int64_t* lda, std::int64_t** ipiv, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* 
scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, double** a, std::int64_t* lda, double** tau, + 
std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, 
oneapi::mkl::uplo *uplo, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + 
std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* 
scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = 
{}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event 
ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); template = nullptr> -std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, 
std::int64_t n, +std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template = nullptr> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template = nullptr> -std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda); +std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, +std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t heevd_scratchpad_size(sycl::queue& 
queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +std::int64_t hegvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t hetrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t hetrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, +std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t 
ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template = nullptr> 
-std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, +std::int64_t ungbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t ungqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t ungtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmtr_scratchpad_size(sycl::queue& queue, 
oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template = nullptr> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda, +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template = nullptr> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template = nullptr> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t 
ldb, std::int64_t stride_b, std::int64_t batch_size); template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, 
std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT 
std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t 
n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> -ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template <> -ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t 
getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t getrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t getrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getri_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t getri_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getri_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t getri_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t getrs_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, 
std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t getrs_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t heevd_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t heevd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t heevd_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t heevd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size>( - sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size>( - sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, 
oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t 
lda, std::int64_t ldc); template <> -ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> -ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> -ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> -ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> -ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> -ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t 
potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t potrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t potrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t potrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t potrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t potrs_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t potrs_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t 
potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t potri_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t potri_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t potri_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t potri_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t 
sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> -ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> 
ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template <> ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> -ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size>(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template <> ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template <> -ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t 
getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, + sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, + sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, 
std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t 
geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::uplo 
uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t 
ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template <> -ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> 
-ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *n, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* n, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *n, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* n, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, 
std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t 
group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, +ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, +ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, - oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, + oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, - oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, + oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - 
sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, 
oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template <> ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); diff --git a/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx b/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx index 774441409..1ba7533c1 100644 --- a/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx +++ b/include/oneapi/mkl/lapack/detail/rocsolver/lapack_ct.hxx @@ -22,2293 +22,2315 @@ // Buffer APIs static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void 
gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, taup, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void geqrf(backend_selector 
selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t 
scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getrf(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, 
std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void getri(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size); -} -static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, - scratchpad, scratchpad_size); -} -static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, - scratchpad, scratchpad_size); -} -static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, - std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, - scratchpad, scratchpad_size); -} -static inline void getrs(backend_selector selector, oneapi::mkl::transpose trans, - std::int64_t 
n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, ldb, - scratchpad, scratchpad_size); + scratchpad_size); +} +static inline void getrs(backend_selector selector, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, + ldb, scratchpad, scratchpad_size); +} +static inline void getrs(backend_selector selector, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, + std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, + ldb, scratchpad, scratchpad_size); +} +static inline void getrs(backend_selector selector, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, + std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, + ldb, scratchpad, scratchpad_size); +} +static inline void getrs(backend_selector selector, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, b, + ldb, scratchpad, scratchpad_size); } static inline 
void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, - std::int64_t ldvt, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, + std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, - vt, ldvt, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, + ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, - std::int64_t ldvt, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, + std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, - vt, ldvt, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, + ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + std::int64_t ldu, sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t 
scratchpad_size) { - oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, - vt, ldvt, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, + ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, - std::int64_t ldu, sycl::buffer> &vt, - std::int64_t ldvt, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, + std::int64_t ldu, sycl::buffer>& vt, + std::int64_t ldvt, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, ldu, - vt, ldvt, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, u, + ldu, vt, ldvt, scratchpad, scratchpad_size); } static inline void heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, - scratchpad_size); + oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, + scratchpad, scratchpad_size); } static inline void heevd(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - 
oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, - scratchpad_size); + oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, + scratchpad, scratchpad_size); } static inline void hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, - w, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, + ldb, w, scratchpad, scratchpad_size); } static inline void hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, - w, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, + ldb, w, scratchpad, scratchpad_size); } static inline void hetrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void hetrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void hetrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void hetrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void orgbr(backend_selector selector, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t 
scratchpad_size) { oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void orgbr(backend_selector selector, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void orgtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void orgtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, - c, ldc, scratchpad, scratchpad_size); + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, + tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, - c, ldc, scratchpad, scratchpad_size); + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + 
oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, + tau, c, ldc, scratchpad, scratchpad_size); } static inline void ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - 
oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, 
a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t 
scratchpad_size) { oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potri(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, 
sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void potrs(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, - scratchpad_size); + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, + scratchpad, scratchpad_size); } static inline void syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &w, sycl::buffer &scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, scratchpad, - scratchpad_size); + oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, + 
scratchpad, scratchpad_size); } static inline void sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, - w, scratchpad, scratchpad_size); + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, + ldb, w, scratchpad, scratchpad_size); } static inline void sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, ldb, - w, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, b, + ldb, w, scratchpad, scratchpad_size); } static inline void sytrd(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void sytrd(backend_selector selector, 
oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void sytrf(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - 
sycl::buffer &ipiv, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, - b, ldb, scratchpad, scratchpad_size); + b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, - b, ldb, scratchpad, scratchpad_size); + b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { 
oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, - b, ldb, scratchpad, scratchpad_size); + b, ldb, scratchpad, scratchpad_size); } static inline void trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, lda, - b, ldb, scratchpad, scratchpad_size); + b, ldb, scratchpad, scratchpad_size); } static inline void ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size); + scratchpad, scratchpad_size); } static inline void ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer>& 
a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void ungtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void ungtr(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, scratchpad, - scratchpad_size); + scratchpad_size); } static inline void unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& 
c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, 
std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, c, - ldc, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, tau, + c, ldc, scratchpad, scratchpad_size); } static inline void unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, - c, ldc, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, + tau, c, ldc, scratchpad, scratchpad_size); } static inline void unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, lda, tau, - c, ldc, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, 
lda, + tau, c, ldc, scratchpad, scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + stride_tau, batch_size, scratchpad, + scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + stride_tau, batch_size, scratchpad, + scratchpad_size); } static inline void geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + stride_tau, batch_size, scratchpad, + scratchpad_size); } static inline void 
geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + stride_tau, batch_size, scratchpad, + scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t 
stride_a, sycl::buffer &ipiv, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void getri_batch(backend_selector selector, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - stride_a, ipiv, stride_ipiv, b, ldb, stride_b, - batch_size, scratchpad, scratchpad_size); + stride_a, ipiv, stride_ipiv, b, ldb, stride_b, + batch_size, scratchpad, scratchpad_size); } static inline void getrs_batch(backend_selector selector, 
oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - stride_a, ipiv, stride_ipiv, b, ldb, stride_b, - batch_size, scratchpad, scratchpad_size); + stride_a, ipiv, stride_ipiv, b, ldb, stride_b, + batch_size, scratchpad, scratchpad_size); } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - stride_a, ipiv, stride_ipiv, b, ldb, stride_b, - batch_size, scratchpad, scratchpad_size); + stride_a, ipiv, stride_ipiv, b, ldb, stride_b, + batch_size, scratchpad, scratchpad_size); } static inline void getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, 
sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - stride_a, ipiv, stride_ipiv, b, ldb, stride_b, - batch_size, scratchpad, scratchpad_size); + stride_a, ipiv, stride_ipiv, b, ldb, stride_b, + batch_size, scratchpad, scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t 
stride_ipiv, + sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, sycl::buffer> &a, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, + sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, ipiv, - stride_ipiv, batch_size, scratchpad, - scratchpad_size); + stride_ipiv, batch_size, scratchpad, + scratchpad_size); } static inline void orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, + tau, stride_tau, batch_size, scratchpad, + scratchpad_size); } static inline void orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, + std::int64_t n, 
std::int64_t k, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { + oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, + tau, stride_tau, batch_size, scratchpad, + scratchpad_size); } -static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void potrf_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, - batch_size, scratchpad, scratchpad_size); + batch_size, scratchpad, scratchpad_size); } -static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void potrf_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, - batch_size, scratchpad, scratchpad_size); + batch_size, scratchpad, scratchpad_size); } -static inline void potrf_batch(backend_selector selector, 
oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, +static inline void potrf_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, std::int64_t batch_size, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, - batch_size, scratchpad, scratchpad_size); + batch_size, scratchpad, scratchpad_size); } -static inline void potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, +static inline void potrf_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, std::int64_t batch_size, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, stride_a, - batch_size, scratchpad, scratchpad_size); + batch_size, scratchpad, scratchpad_size); } -static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +static inline void potrs_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, + std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, 
stride_b, batch_size, scratchpad, - scratchpad_size); -} -static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + stride_a, b, ldb, stride_b, batch_size, scratchpad, + scratchpad_size); +} +static inline void potrs_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, + std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, stride_b, batch_size, scratchpad, - scratchpad_size); + stride_a, b, ldb, stride_b, batch_size, scratchpad, + scratchpad_size); } -static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &b, +static inline void potrs_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, stride_b, batch_size, scratchpad, - scratchpad_size); + stride_a, b, ldb, stride_b, batch_size, scratchpad, + scratchpad_size); } -static inline void potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - std::int64_t 
stride_a, sycl::buffer> &b, +static inline void potrs_batch(backend_selector selector, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, stride_b, batch_size, scratchpad, - scratchpad_size); + stride_a, b, ldb, stride_b, batch_size, scratchpad, + scratchpad_size); } static inline void ungqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, + sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, + tau, stride_tau, batch_size, scratchpad, + scratchpad_size); } static inline void ungqr_batch(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { - oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, tau, - stride_tau, batch_size, scratchpad, scratchpad_size); + 
oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, stride_a, + tau, stride_tau, batch_size, scratchpad, + scratchpad_size); } // USM APIs static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, float *d, - float *e, std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, float* d, + float* e, std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, - taup, scratchpad, scratchpad_size, dependencies); + taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *d, double *e, - double *tauq, double *taup, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* d, double* e, + double* tauq, double* taup, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, - taup, scratchpad, scratchpad_size, dependencies); + taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *d, float *e, - float *tauq, float *taup, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* d, float* e, + float* tauq, float* taup, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, 
e, tauq, - taup, scratchpad, scratchpad_size, dependencies); + taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gebrd(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *d, double *e, std::complex *tauq, - std::complex *taup, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + double* d, double* e, std::complex* tauq, + std::complex* taup, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gebrd(selector.get_queue(), m, n, a, lda, d, e, tauq, - taup, scratchpad, scratchpad_size, dependencies); + taup, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { + return 
oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event gerqf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::gerqf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + const std::vector& dependencies = {}) { + return 
oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + std::int64_t n, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + std::int64_t n, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event geqrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, scratchpad, - scratchpad_size, dependencies); + const std::vector& dependencies = {}) { + return 
oneapi::mkl::lapack::rocsolver::geqrf(selector.get_queue(), m, n, a, lda, tau, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, double* a, std::int64_t lda, std::int64_t* ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, float* a, std::int64_t lda, std::int64_t* ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrf(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t 
*ipiv, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf(selector.get_queue(), m, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event getri(backend_selector selector, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event 
getri(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri(selector.get_queue(), n, a, lda, ipiv, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, - b, ldb, scratchpad, scratchpad_size, dependencies); + b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t *ipiv, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, std::int64_t* ipiv, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, - b, ldb, scratchpad, scratchpad_size, dependencies); + b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, 
oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t *ipiv, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, std::int64_t* ipiv, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, - b, ldb, scratchpad, scratchpad_size, dependencies); + b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs(selector.get_queue(), trans, n, nrhs, a, lda, ipiv, - b, ldb, scratchpad, scratchpad_size, dependencies); + b, ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *s, double *u, - std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, double* s, double* u, + std::int64_t ldu, double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, - u, ldu, vt, ldvt, scratchpad, 
scratchpad_size, - dependencies); + u, ldu, vt, ldvt, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *s, float *u, - std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, float* s, float* u, + std::int64_t ldu, float* vt, std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, - u, ldu, vt, ldvt, scratchpad, scratchpad_size, - dependencies); + u, ldu, vt, ldvt, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::int64_t n, std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, - u, ldu, vt, ldvt, scratchpad, scratchpad_size, - dependencies); + u, ldu, vt, ldvt, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event gesvd(backend_selector selector, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - double *s, std::complex *u, std::int64_t ldu, - std::complex *vt, std::int64_t ldvt, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies = {}) { + std::int64_t n, std::complex* a, std::int64_t lda, + double* s, std::complex* u, std::int64_t ldu, + std::complex* vt, std::int64_t ldvt, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::gesvd(selector.get_queue(), jobu, jobvt, m, n, a, lda, s, - u, ldu, vt, ldvt, scratchpad, scratchpad_size, - dependencies); + u, ldu, vt, ldvt, scratchpad, scratchpad_size, + dependencies); } -static inline sycl::event heevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, float *w, std::complex *scratchpad, - std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event heevd(backend_selector selector, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } -static inline sycl::event heevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, double *w, std::complex *scratchpad, - std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event heevd(backend_selector selector, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::heevd(selector.get_queue(), jobz, uplo, n, a, lda, w, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event 
hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, float *w, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, - b, ldb, w, scratchpad, scratchpad_size, - dependencies); + b, ldb, w, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event hegvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, double *w, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::hegvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, - b, ldb, w, scratchpad, scratchpad_size, - dependencies); + b, ldb, w, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event hetrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, float *d, float *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, float* d, float* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size, dependencies); + 
scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, double *d, double *e, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, double* d, double* e, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::hetrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event hetrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::hetrf(selector.get_queue(), uplo, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, 
dependencies); } static inline sycl::event orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t k, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgqr(selector.get_queue(), m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgtr(selector.get_queue(), uplo, n, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, float* tau, 
float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, - lda, tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + lda, tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event ormtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ormtr(selector.get_queue(), side, uplo, trans, m, n, a, - lda, tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + lda, tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event ormrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, 
std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ormrq(selector.get_queue(), side, trans, m, n, k, a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), side, trans, m, n, k, a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event ormqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ormqr(selector.get_queue(), 
side, trans, m, n, k, a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event potrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, 
std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, 
dependencies); } static inline sycl::event potri(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potri(selector.get_queue(), uplo, n, a, lda, scratchpad, - scratchpad_size, dependencies); + scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size, dependencies); + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, + ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size, dependencies); + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, + ldb, scratchpad, 
scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size, dependencies); + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, + ldb, scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, ldb, - scratchpad, scratchpad_size, dependencies); -} -static inline sycl::event syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *w, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::potrs(selector.get_queue(), uplo, n, nrhs, a, lda, b, + ldb, scratchpad, scratchpad_size, dependencies); +} +static inline sycl::event syevd(backend_selector selector, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* w, double* scratchpad, + std::int64_t scratchpad_size, + const std::vector& 
dependencies = {}) { return oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } -static inline sycl::event syevd(backend_selector selector, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *w, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { +static inline sycl::event syevd(backend_selector selector, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* w, float* scratchpad, + std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::syevd(selector.get_queue(), jobz, uplo, n, a, lda, w, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *b, std::int64_t ldb, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* a, std::int64_t lda, double* b, std::int64_t ldb, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, - b, ldb, w, scratchpad, scratchpad_size, - dependencies); + b, ldb, w, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event sygvd(backend_selector selector, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *b, std::int64_t ldb, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* a, std::int64_t lda, float* b, std::int64_t ldb, float* w, + float* scratchpad, 
std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::sygvd(selector.get_queue(), itype, jobz, uplo, n, a, lda, - b, ldb, w, scratchpad, scratchpad_size, - dependencies); + b, ldb, w, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event sytrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *d, double *e, double *tau, double *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* d, double* e, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrd(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *d, float *e, float *tau, float *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* d, float* e, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::sytrd(selector.get_queue(), uplo, n, a, lda, d, e, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event sytrf(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::sytrf(selector.get_queue(), uplo, n, a, lda, ipiv, - scratchpad, scratchpad_size, dependencies); + 
scratchpad, scratchpad_size, dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, - lda, b, ldb, scratchpad, scratchpad_size, - dependencies); + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, + a, lda, b, ldb, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, - lda, b, ldb, scratchpad, scratchpad_size, - dependencies); + double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, + a, lda, b, ldb, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t 
lda, float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, - lda, b, ldb, scratchpad, scratchpad_size, - dependencies); + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, + a, lda, b, ldb, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event trtrs(backend_selector selector, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, a, - lda, b, ldb, scratchpad, scratchpad_size, - dependencies); + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::trtrs(selector.get_queue(), uplo, trans, diag, n, nrhs, + a, lda, b, ldb, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event ungbr(backend_selector selector, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungbr(backend_selector selector, 
oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungbr(selector.get_queue(), vec, m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungqr(selector.get_queue(), m, n, k, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - 
std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungtr(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungtr(selector.get_queue(), uplo, n, a, lda, tau, - scratchpad, scratchpad_size, dependencies); + scratchpad, scratchpad_size, dependencies); } static inline sycl::event unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event unmrq(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::unmrq(selector.get_queue(), side, trans, m, n, k, a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event unmqr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::unmqr(selector.get_queue(), side, trans, m, n, k, 
a, lda, - tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, - lda, tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + lda, tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event unmtr(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::unmtr(selector.get_queue(), side, uplo, trans, m, n, a, - lda, tau, c, ldc, scratchpad, scratchpad_size, - dependencies); + lda, tau, c, ldc, scratchpad, scratchpad_size, + dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, 
std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - tau, stride_tau, batch_size, scratchpad, - scratchpad_size, dependencies); + tau, stride_tau, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - tau, stride_tau, batch_size, scratchpad, - scratchpad_size, dependencies); + tau, stride_tau, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - tau, stride_tau, batch_size, scratchpad, - scratchpad_size, dependencies); + tau, stride_tau, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline 
sycl::event geqrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - tau, stride_tau, batch_size, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { - return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + tau, stride_tau, batch_size, scratchpad, + scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event geqrf_batch(backend_selector selector, + std::int64_t* m, 
std::int64_t* n, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } -static inline sycl::event geqrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event geqrf_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event geqrf_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event geqrf_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, 
std::int64_t* group_sizes, + std::complex* scratchpad, + std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { + return oneapi::mkl::lapack::rocsolver::geqrf_batch(selector.get_queue(), m, n, a, lda, tau, + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); + ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); + ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + 
std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); + ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getrf_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, - std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event getrf_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, - std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event getrf_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, - std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event getrf_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event getrf_batch(backend_selector selector, std::int64_t *m, - std::int64_t 
*n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event getrf_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrf_batch(selector.get_queue(), m, n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, float *scratchpad, + float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); + ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, double *scratchpad, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); + ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); + ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getri_batch(backend_selector selector, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, stride_a, - ipiv, stride_ipiv, batch_size, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event getri_batch(backend_selector selector, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + 
ipiv, stride_ipiv, batch_size, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event getri_batch(backend_selector selector, + std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, + std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event getri_batch(backend_selector selector, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event getri_batch(backend_selector selector, + std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, + std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } -static inline sycl::event getri_batch(backend_selector selector, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event getri_batch(backend_selector selector, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return 
oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } -static inline sycl::event getri_batch(backend_selector selector, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +static inline sycl::event getri_batch(backend_selector selector, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getri_batch(selector.get_queue(), n, a, lda, ipiv, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - 
std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch( backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}) { + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch( backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, 
std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}) { + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch( selector.get_queue(), trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, - std::int64_t **ipiv, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, + std::int64_t** ipiv, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - ipiv, b, ldb, group_count, group_sizes, - scratchpad, scratchpad_size, dependencies); + ipiv, b, ldb, group_count, group_sizes, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, - std::int64_t **ipiv, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, 
double** a, std::int64_t* lda, + std::int64_t** ipiv, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - ipiv, b, ldb, group_count, group_sizes, - scratchpad, scratchpad_size, dependencies); + ipiv, b, ldb, group_count, group_sizes, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - ipiv, b, ldb, group_count, group_sizes, - scratchpad, scratchpad_size, dependencies); + ipiv, b, ldb, group_count, group_sizes, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event getrs_batch( - backend_selector selector, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + backend_selector selector, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, 
std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::getrs_batch(selector.get_queue(), trans, n, nrhs, a, lda, - ipiv, b, ldb, group_count, group_sizes, - scratchpad, scratchpad_size, dependencies); + ipiv, b, ldb, group_count, group_sizes, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, - stride_a, tau, stride_tau, batch_size, - scratchpad, scratchpad_size, dependencies); + stride_a, tau, stride_tau, batch_size, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, - stride_a, tau, stride_tau, batch_size, - 
scratchpad, scratchpad_size, dependencies); + stride_a, tau, stride_tau, batch_size, + scratchpad, scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +static inline sycl::event orgqr_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } -static inline sycl::event orgqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +static inline sycl::event orgqr_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::int64_t* k, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::orgqr_batch(selector.get_queue(), m, n, k, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, 
std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - stride_a, batch_size, scratchpad, - scratchpad_size, dependencies); + stride_a, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - stride_a, batch_size, scratchpad, - scratchpad_size, dependencies); + stride_a, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - stride_a, batch_size, scratchpad, - scratchpad_size, dependencies); + stride_a, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, 
- std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - stride_a, batch_size, scratchpad, - scratchpad_size, dependencies); + stride_a, batch_size, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - 
std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrf_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrf_batch(selector.get_queue(), uplo, n, a, lda, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, std::int64_t stride_a, float *b, + float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, 
stride_b, batch_size, - scratchpad, scratchpad_size, dependencies); + stride_a, b, ldb, stride_b, batch_size, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, std::int64_t stride_a, double *b, + double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, double *scratchpad, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, stride_b, batch_size, - scratchpad, scratchpad_size, dependencies); + stride_a, b, ldb, stride_b, batch_size, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, stride_b, batch_size, - scratchpad, scratchpad_size, dependencies); + stride_a, b, ldb, stride_b, batch_size, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, + std::complex* a, std::int64_t lda, + 
std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - stride_a, b, ldb, stride_b, batch_size, - scratchpad, scratchpad_size, dependencies); + stride_a, b, ldb, stride_b, batch_size, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - float **a, std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - b, ldb, group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + b, ldb, group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - double **a, std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + double** a, std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, 
std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - b, ldb, group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + b, ldb, group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - b, ldb, group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + b, ldb, group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event potrs_batch(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::potrs_batch(selector.get_queue(), uplo, n, nrhs, a, lda, - b, ldb, 
group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + b, ldb, group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, + std::int64_t n, std::int64_t k, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, - stride_a, tau, stride_tau, batch_size, - scratchpad, scratchpad_size, dependencies); + stride_a, tau, stride_tau, batch_size, + scratchpad, scratchpad_size, dependencies); } static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, + std::int64_t n, std::int64_t k, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, + std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, - stride_a, tau, stride_tau, batch_size, - scratchpad, scratchpad_size, dependencies); -} -static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies 
= {}) { + stride_a, tau, stride_tau, batch_size, + scratchpad, scratchpad_size, dependencies); +} +static inline sycl::event ungqr_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, + std::int64_t scratchpad_size, + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); -} -static inline sycl::event ungqr_batch(backend_selector selector, std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); +} +static inline sycl::event ungqr_batch(backend_selector selector, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return oneapi::mkl::lapack::rocsolver::ungqr_batch(selector.get_queue(), m, n, k, a, lda, tau, - group_count, group_sizes, scratchpad, - scratchpad_size, dependencies); + group_count, group_sizes, scratchpad, + scratchpad_size, dependencies); } // SCRATCHPAD APIs template std::int64_t gebrd_scratchpad_size(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::gebrd_scratchpad_size(selector.get_queue(), m, n, - lda); + return oneapi::mkl::lapack::rocsolver::gebrd_scratchpad_size(selector.get_queue(), m, + n, lda); } template std::int64_t gerqf_scratchpad_size(backend_selector selector, std::int64_t m, 
std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::gerqf_scratchpad_size(selector.get_queue(), m, n, - lda); + return oneapi::mkl::lapack::rocsolver::gerqf_scratchpad_size(selector.get_queue(), m, + n, lda); } template std::int64_t geqrf_scratchpad_size(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::geqrf_scratchpad_size(selector.get_queue(), m, n, - lda); + return oneapi::mkl::lapack::rocsolver::geqrf_scratchpad_size(selector.get_queue(), m, + n, lda); } template std::int64_t gesvd_scratchpad_size(backend_selector selector, @@ -2321,82 +2343,83 @@ std::int64_t gesvd_scratchpad_size(backend_selector selector template std::int64_t getrf_scratchpad_size(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::getrf_scratchpad_size(selector.get_queue(), m, n, - lda); + return oneapi::mkl::lapack::rocsolver::getrf_scratchpad_size(selector.get_queue(), m, + n, lda); } template std::int64_t getri_scratchpad_size(backend_selector selector, std::int64_t n, std::int64_t lda) { return oneapi::mkl::lapack::rocsolver::getri_scratchpad_size(selector.get_queue(), n, - lda); + lda); } template std::int64_t getrs_scratchpad_size(backend_selector selector, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return oneapi::mkl::lapack::rocsolver::getrs_scratchpad_size(selector.get_queue(), - trans, n, nrhs, lda, ldb); + trans, n, nrhs, lda, ldb); } template std::int64_t heevd_scratchpad_size(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::heevd_scratchpad_size(selector.get_queue(), jobz, - uplo, n, lda); + return oneapi::mkl::lapack::rocsolver::heevd_scratchpad_size(selector.get_queue(), + jobz, uplo, n, lda); } template -std::int64_t hegvd_scratchpad_size(backend_selector 
selector, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t lda, std::int64_t ldb) { +std::int64_t hegvd_scratchpad_size(backend_selector selector, + std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, + std::int64_t ldb) { return oneapi::mkl::lapack::rocsolver::hegvd_scratchpad_size( selector.get_queue(), itype, jobz, uplo, n, lda, ldb); } template std::int64_t hetrd_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::hetrd_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::hetrd_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t hetrf_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::hetrf_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::hetrf_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t orgbr_scratchpad_size(backend_selector selector, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::orgbr_scratchpad_size(selector.get_queue(), vect, - m, n, k, lda); + return oneapi::mkl::lapack::rocsolver::orgbr_scratchpad_size(selector.get_queue(), + vect, m, n, k, lda); } template std::int64_t orgtr_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::orgtr_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::orgtr_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t orgqr_scratchpad_size(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { - return 
oneapi::mkl::lapack::rocsolver::orgqr_scratchpad_size(selector.get_queue(), m, n, - k, lda); + return oneapi::mkl::lapack::rocsolver::orgqr_scratchpad_size(selector.get_queue(), m, + n, k, lda); } template std::int64_t ormrq_scratchpad_size(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { - return oneapi::mkl::lapack::rocsolver::ormrq_scratchpad_size(selector.get_queue(), side, - trans, m, n, k, lda, ldc); + return oneapi::mkl::lapack::rocsolver::ormrq_scratchpad_size( + selector.get_queue(), side, trans, m, n, k, lda, ldc); } template std::int64_t ormqr_scratchpad_size(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { - return oneapi::mkl::lapack::rocsolver::ormqr_scratchpad_size(selector.get_queue(), side, - trans, m, n, k, lda, ldc); + return oneapi::mkl::lapack::rocsolver::ormqr_scratchpad_size( + selector.get_queue(), side, trans, m, n, k, lda, ldc); } template std::int64_t ormtr_scratchpad_size(backend_selector selector, @@ -2409,47 +2432,48 @@ std::int64_t ormtr_scratchpad_size(backend_selector selector template std::int64_t potrf_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::potrf_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::potrf_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t potrs_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { - return oneapi::mkl::lapack::rocsolver::potrs_scratchpad_size(selector.get_queue(), uplo, - n, nrhs, lda, ldb); + return oneapi::mkl::lapack::rocsolver::potrs_scratchpad_size(selector.get_queue(), + uplo, n, nrhs, lda, ldb); } 
template std::int64_t potri_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::potri_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::potri_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t sytrf_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::sytrf_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::sytrf_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t syevd_scratchpad_size(backend_selector selector, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::syevd_scratchpad_size(selector.get_queue(), jobz, - uplo, n, lda); + return oneapi::mkl::lapack::rocsolver::syevd_scratchpad_size(selector.get_queue(), + jobz, uplo, n, lda); } template -std::int64_t sygvd_scratchpad_size(backend_selector selector, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t lda, std::int64_t ldb) { +std::int64_t sygvd_scratchpad_size(backend_selector selector, + std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, + std::int64_t ldb) { return oneapi::mkl::lapack::rocsolver::sygvd_scratchpad_size( selector.get_queue(), itype, jobz, uplo, n, lda, ldb); } template std::int64_t sytrd_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::sytrd_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::sytrd_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t trtrs_scratchpad_size(backend_selector selector, @@ -2463,36 +2487,36 @@ template 
std::int64_t ungbr_scratchpad_size(backend_selector selector, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::ungbr_scratchpad_size(selector.get_queue(), vect, - m, n, k, lda); + return oneapi::mkl::lapack::rocsolver::ungbr_scratchpad_size(selector.get_queue(), + vect, m, n, k, lda); } template std::int64_t ungqr_scratchpad_size(backend_selector selector, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::ungqr_scratchpad_size(selector.get_queue(), m, n, - k, lda); + return oneapi::mkl::lapack::rocsolver::ungqr_scratchpad_size(selector.get_queue(), m, + n, k, lda); } template std::int64_t ungtr_scratchpad_size(backend_selector selector, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { - return oneapi::mkl::lapack::rocsolver::ungtr_scratchpad_size(selector.get_queue(), uplo, - n, lda); + return oneapi::mkl::lapack::rocsolver::ungtr_scratchpad_size(selector.get_queue(), + uplo, n, lda); } template std::int64_t unmrq_scratchpad_size(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { - return oneapi::mkl::lapack::rocsolver::unmrq_scratchpad_size(selector.get_queue(), side, - trans, m, n, k, lda, ldc); + return oneapi::mkl::lapack::rocsolver::unmrq_scratchpad_size( + selector.get_queue(), side, trans, m, n, k, lda, ldc); } template std::int64_t unmqr_scratchpad_size(backend_selector selector, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { - return oneapi::mkl::lapack::rocsolver::unmqr_scratchpad_size(selector.get_queue(), side, - trans, m, n, k, lda, ldc); + return oneapi::mkl::lapack::rocsolver::unmqr_scratchpad_size( + selector.get_queue(), side, trans, m, n, k, lda, ldc); } template std::int64_t 
unmtr_scratchpad_size(backend_selector selector, @@ -2568,62 +2592,62 @@ std::int64_t ungqr_batch_scratchpad_size(backend_selector se } template std::int64_t getrf_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::getrf_batch_scratchpad_size( selector.get_queue(), m, n, lda, group_count, group_sizes); } template std::int64_t getri_batch_scratchpad_size(backend_selector selector, - std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::getri_batch_scratchpad_size( selector.get_queue(), n, lda, group_count, group_sizes); } template std::int64_t getrs_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::getrs_batch_scratchpad_size( selector.get_queue(), trans, n, nrhs, lda, ldb, group_count, group_sizes); } template std::int64_t geqrf_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::geqrf_batch_scratchpad_size( selector.get_queue(), m, n, lda, group_count, group_sizes); } template std::int64_t 
orgqr_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::orgqr_batch_scratchpad_size( selector.get_queue(), m, n, k, lda, group_count, group_sizes); } template std::int64_t potrf_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::potrf_batch_scratchpad_size( selector.get_queue(), uplo, n, lda, group_count, group_sizes); } template std::int64_t potrs_batch_scratchpad_size(backend_selector selector, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::potrs_batch_scratchpad_size( selector.get_queue(), uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template std::int64_t ungqr_batch_scratchpad_size(backend_selector selector, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return oneapi::mkl::lapack::rocsolver::ungqr_batch_scratchpad_size( selector.get_queue(), m, n, k, lda, group_count, group_sizes); } diff --git 
a/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx b/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx index c68009e54..3b205f606 100644 --- a/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx +++ b/include/oneapi/mkl/lapack/detail/rocsolver/onemkl_lapack_rocsolver.hxx @@ -21,1815 +21,1812 @@ // Buffer APIs -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tauq, - sycl::buffer &taup, sycl::buffer &scratchpad, +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tauq, + sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> 
&tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, 
+ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, 
sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, 
+ONEMKL_EXPORT void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void getrs(sycl::queue& queue, 
oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + 
sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, +ONEMKL_EXPORT void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +ONEMKL_EXPORT void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, 
std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +ONEMKL_EXPORT void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, 
- sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void 
orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, 
std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, +ONEMKL_EXPORT void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, 
sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &scratchpad, 
std::int64_t scratchpad_size); +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - 
sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + 
sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, +ONEMKL_EXPORT void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, 
oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& 
b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void 
ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, +ONEMKL_EXPORT void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, 
std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, +ONEMKL_EXPORT void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - 
sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void 
geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, 
- sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getri_batch(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getri_batch(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, 
std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, 
std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, +ONEMKL_EXPORT void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void 
orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, +ONEMKL_EXPORT void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, 
oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); + sycl::buffer& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, +ONEMKL_EXPORT void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, 
std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -ONEMKL_EXPORT void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +ONEMKL_EXPORT void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); // USM APIs -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *d, double *e, double *tauq, double *taup, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *d, float *e, float *tauq, float *taup, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT 
sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* d, double* e, double* tauq, double* taup, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* d, float* e, float* tauq, float* taup, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, 
double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); 
-ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, 
float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t 
n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, - double *b, std::int64_t ldb, double *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t* ipiv, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, - float *b, std::int64_t ldb, float *scratchpad, +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t* ipiv, + float* b, std::int64_t ldb, float* scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *s, double *u, std::int64_t ldu, - double *vt, std::int64_t ldvt, double *scratchpad, + double* a, std::int64_t lda, double* s, double* u, std::int64_t ldu, + double* vt, std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt, - std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* s, float* u, std::int64_t ldu, float* vt, + std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event 
gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *s, - std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, + std::complex* a, std::int64_t lda, double* s, + std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - double *w, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event heevd(sycl::queue& queue, 
oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - float *w, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - double *w, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT 
sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event hetrf(sycl::queue& 
queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *scratchpad, +ONEMKL_EXPORT sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double 
*tau, double *scratchpad, +ONEMKL_EXPORT sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t 
scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, 
oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, 
std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, 
double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *w, double *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t 
n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *w, float *scratchpad, +ONEMKL_EXPORT sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *d, double *e, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* d, double* 
e, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *d, float *e, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* d, float* e, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT 
sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, - double *b, std::int64_t ldb, double *scratchpad, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *scratchpad, + std::int64_t 
n, std::int64_t nrhs, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* 
a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, 
oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, 
std::int64_t stride_tau, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - 
float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, 
std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); 
-ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + 
+ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); 
-ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - 
std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + 
float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, + 
std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, float **a, - std::int64_t *lda, std::int64_t **ipiv, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, float** a, + std::int64_t* lda, std::int64_t** ipiv, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, double **a, - std::int64_t *lda, std::int64_t **ipiv, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, double** a, + std::int64_t* lda, std::int64_t** ipiv, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - 
std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, 
- std::int64_t stride_a, double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + 
float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, 
oneapi::mkl::uplo* uplo, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + double** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue 
&queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, 
std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t 
*group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); - -ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); + +ONEMKL_EXPORT sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex 
*tau, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +ONEMKL_EXPORT sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); // SCRATCHPAD APIs template -ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, 
std::int64_t n, +ONEMKL_EXPORT std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +ONEMKL_EXPORT std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template -ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +ONEMKL_EXPORT std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT std::int64_t heevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT std::int64_t hegvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t 
hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t hetrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t hetrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, +ONEMKL_EXPORT std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, 
oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +ONEMKL_EXPORT std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +ONEMKL_EXPORT std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t 
trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template -ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, +ONEMKL_EXPORT std::int64_t ungbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +ONEMKL_EXPORT std::int64_t ungqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t ungtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template -ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t unmrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t unmqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +ONEMKL_EXPORT std::int64_t unmtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template -ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, 
+ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, - oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, - std::int64_t lda, std::int64_t stride_a, - std::int64_t stride_ipiv, std::int64_t ldb, - std::int64_t stride_b, - std::int64_t batch_size); +ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t 
batch_size); template -ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, +ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template -ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +ONEMKL_EXPORT std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template ONEMKL_EXPORT std::int64_t getrs_batch_scratchpad_size( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t geqrf_batch_scratchpad_size(sycl::queue& 
queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +ONEMKL_EXPORT std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +ONEMKL_EXPORT std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +ONEMKL_EXPORT std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); template -ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +ONEMKL_EXPORT std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); diff --git a/include/oneapi/mkl/lapack/exceptions.hpp b/include/oneapi/mkl/lapack/exceptions.hpp index da205cc1a..59de3b4de 100644 --- a/include/oneapi/mkl/lapack/exceptions.hpp +++ 
b/include/oneapi/mkl/lapack/exceptions.hpp @@ -25,7 +25,7 @@ namespace lapack { class exception { public: - exception(oneapi::mkl::exception *_ex, std::int64_t info, std::int64_t detail = 0) + exception(oneapi::mkl::exception* _ex, std::int64_t info, std::int64_t detail = 0) : _info(info), _detail(detail), _ex(_ex) {} @@ -35,20 +35,20 @@ class exception { std::int64_t detail() const { return _detail; } - const char *what() const { + const char* what() const { return _ex->what(); } private: std::int64_t _info; std::int64_t _detail; - mkl::exception *_ex; + mkl::exception* _ex; }; class computation_error : public oneapi::mkl::computation_error, public oneapi::mkl::lapack::exception { public: - computation_error(const std::string &function, const std::string &info, std::int64_t code) + computation_error(const std::string& function, const std::string& info, std::int64_t code) : oneapi::mkl::computation_error("LAPACK", function, info), oneapi::mkl::lapack::exception(this, code) {} using oneapi::mkl::computation_error::what; @@ -56,17 +56,17 @@ class computation_error : public oneapi::mkl::computation_error, class batch_error : public oneapi::mkl::batch_error, public oneapi::mkl::lapack::exception { public: - batch_error(const std::string &function, const std::string &info, std::int64_t num_errors, + batch_error(const std::string& function, const std::string& info, std::int64_t num_errors, std::vector ids = {}, std::vector exceptions = {}) : oneapi::mkl::batch_error("LAPACK", function, info), oneapi::mkl::lapack::exception(this, num_errors), _ids(ids), _exceptions(exceptions) {} using oneapi::mkl::batch_error::what; - const std::vector &ids() const { + const std::vector& ids() const { return _ids; } - const std::vector &exceptions() const { + const std::vector& exceptions() const { return _exceptions; } @@ -78,7 +78,7 @@ class batch_error : public oneapi::mkl::batch_error, public oneapi::mkl::lapack: class invalid_argument : public oneapi::mkl::invalid_argument, public 
oneapi::mkl::lapack::exception { public: - invalid_argument(const std::string &function, const std::string &info, + invalid_argument(const std::string& function, const std::string& info, std::int64_t arg_position = 0, std::int64_t detail = 0) : oneapi::mkl::invalid_argument("LAPACK", function, info), oneapi::mkl::lapack::exception(this, arg_position, detail) {} diff --git a/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp b/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp index ace216f00..75ee22211 100644 --- a/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp +++ b/include/oneapi/mkl/sparse_blas/detail/helper_types.hpp @@ -40,7 +40,7 @@ inline constexpr bool is_int_supported_v = template inline constexpr bool are_fp_int_supported_v = - is_fp_supported_v&& is_int_supported_v; + is_fp_supported_v && is_int_supported_v; } // namespace detail } // namespace sparse diff --git a/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx b/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx index 4b701eb6f..e25fff46e 100644 --- a/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx +++ b/include/oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx @@ -22,196 +22,196 @@ // Dense vector template -ONEMKL_EXPORT void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, +ONEMKL_EXPORT void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, sycl::buffer val); template -ONEMKL_EXPORT void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, - std::int64_t size, dataType *val); +ONEMKL_EXPORT void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, + std::int64_t size, dataType* val); template -ONEMKL_EXPORT void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, +ONEMKL_EXPORT void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, std::int64_t size, 
sycl::buffer val); template -ONEMKL_EXPORT void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, - std::int64_t size, dataType *val); +ONEMKL_EXPORT void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, + std::int64_t size, dataType* val); -ONEMKL_EXPORT sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle, + const std::vector& dependencies = {}); // Dense matrix template -ONEMKL_EXPORT void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, +ONEMKL_EXPORT void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, sycl::buffer val); template -ONEMKL_EXPORT void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, +ONEMKL_EXPORT void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, - layout dense_layout, dataType *val); + layout dense_layout, dataType* val); template -ONEMKL_EXPORT void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, +ONEMKL_EXPORT void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, sycl::buffer val); template -ONEMKL_EXPORT void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, +ONEMKL_EXPORT void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t ld, layout dense_layout, dataType *val); + std::int64_t ld, layout dense_layout, dataType* val); -ONEMKL_EXPORT sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, - const std::vector 
&dependencies = {}); +ONEMKL_EXPORT sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle, + const std::vector& dependencies = {}); // COO matrix template -ONEMKL_EXPORT void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, +ONEMKL_EXPORT void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val); template -ONEMKL_EXPORT void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, +ONEMKL_EXPORT void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, - index_base index, indexType *row_ind, indexType *col_ind, - dataType *val); + index_base index, indexType* row_ind, indexType* col_ind, + dataType* val); template -ONEMKL_EXPORT void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, +ONEMKL_EXPORT void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val); template -ONEMKL_EXPORT void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, +ONEMKL_EXPORT void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t nnz, index_base index, indexType *row_ind, - indexType *col_ind, dataType *val); + std::int64_t nnz, index_base index, indexType* row_ind, + indexType* col_ind, dataType* val); // CSR matrix template -ONEMKL_EXPORT void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, +ONEMKL_EXPORT void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val); template 
-ONEMKL_EXPORT void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, +ONEMKL_EXPORT void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, - index_base index, indexType *row_ptr, indexType *col_ind, - dataType *val); + index_base index, indexType* row_ptr, indexType* col_ind, + dataType* val); template -ONEMKL_EXPORT void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, +ONEMKL_EXPORT void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val); template -ONEMKL_EXPORT void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, +ONEMKL_EXPORT void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t nnz, index_base index, indexType *row_ptr, - indexType *col_ind, dataType *val); + std::int64_t nnz, index_base index, indexType* row_ptr, + indexType* col_ind, dataType* val); // Common sparse matrix functions -ONEMKL_EXPORT sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle, + const std::vector& dependencies = {}); -bool set_matrix_property(sycl::queue &queue, matrix_handle_t smhandle, matrix_property property); +bool set_matrix_property(sycl::queue& queue, matrix_handle_t smhandle, matrix_property property); // SPMM -ONEMKL_EXPORT void init_spmm_descr(sycl::queue &queue, spmm_descr_t *p_spmm_descr); +ONEMKL_EXPORT void init_spmm_descr(sycl::queue& queue, spmm_descr_t* p_spmm_descr); -ONEMKL_EXPORT sycl::event release_spmm_descr(sycl::queue &queue, spmm_descr_t spmm_descr, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event 
release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, + const std::vector& dependencies = {}); -ONEMKL_EXPORT void spmm_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, +ONEMKL_EXPORT void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, - spmm_descr_t spmm_descr, std::size_t &temp_buffer_size); + spmm_descr_t spmm_descr, std::size_t& temp_buffer_size); -ONEMKL_EXPORT void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, +ONEMKL_EXPORT void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer workspace); -ONEMKL_EXPORT sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, +ONEMKL_EXPORT sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, - spmm_descr_t spmm_descr, void *workspace, - const std::vector &dependencies = {}); + spmm_descr_t spmm_descr, void* workspace, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event spmm(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, 
matrix_view A_view, +ONEMKL_EXPORT sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); // SPMV -ONEMKL_EXPORT void init_spmv_descr(sycl::queue &queue, spmv_descr_t *p_spmv_descr); +ONEMKL_EXPORT void init_spmv_descr(sycl::queue& queue, spmv_descr_t* p_spmv_descr); -ONEMKL_EXPORT sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, + const std::vector& dependencies = {}); -ONEMKL_EXPORT void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, +ONEMKL_EXPORT void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, std::size_t &temp_buffer_size); + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size); -ONEMKL_EXPORT void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +ONEMKL_EXPORT void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, sycl::buffer workspace); -ONEMKL_EXPORT sycl::event spmv_optimize(sycl::queue &queue, 
oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, +ONEMKL_EXPORT sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, - spmv_alg alg, spmv_descr_t spmv_descr, void *workspace, - const std::vector &dependencies = {}); + const void* beta, dense_vector_handle_t y_handle, + spmv_alg alg, spmv_descr_t spmv_descr, void* workspace, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +ONEMKL_EXPORT sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); // SPSV -ONEMKL_EXPORT void init_spsv_descr(sycl::queue &queue, spsv_descr_t *p_spsv_descr); +ONEMKL_EXPORT void init_spsv_descr(sycl::queue& queue, spsv_descr_t* p_spsv_descr); -ONEMKL_EXPORT sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, - const std::vector &dependencies = {}); +ONEMKL_EXPORT sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, + const std::vector& dependencies = {}); -ONEMKL_EXPORT void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, +ONEMKL_EXPORT void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - std::size_t &temp_buffer_size); + std::size_t& temp_buffer_size); -ONEMKL_EXPORT void 
spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +ONEMKL_EXPORT void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer workspace); -ONEMKL_EXPORT sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, +ONEMKL_EXPORT sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, - spsv_descr_t spsv_descr, void *workspace, - const std::vector &dependencies = {}); + spsv_descr_t spsv_descr, void* workspace, + const std::vector& dependencies = {}); -ONEMKL_EXPORT sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +ONEMKL_EXPORT sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); diff --git a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx index ca09d09d4..aacc32ce3 100644 --- a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx +++ b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_ct.hxx @@ -27,14 +27,14 @@ // Dense vector template std::enable_if_t> init_dense_vector( - backend_selector selector, dense_vector_handle_t *p_dvhandle, + backend_selector selector, dense_vector_handle_t* p_dvhandle, std::int64_t size, sycl::buffer val) { BACKEND::init_dense_vector(selector.get_queue(), p_dvhandle, size, val); } template std::enable_if_t> init_dense_vector( - backend_selector 
selector, dense_vector_handle_t *p_dvhandle, - std::int64_t size, dataType *val) { + backend_selector selector, dense_vector_handle_t* p_dvhandle, + std::int64_t size, dataType* val) { BACKEND::init_dense_vector(selector.get_queue(), p_dvhandle, size, val); } @@ -47,20 +47,20 @@ std::enable_if_t> set_dense_vector_data( template std::enable_if_t> set_dense_vector_data( backend_selector selector, dense_vector_handle_t dvhandle, std::int64_t size, - dataType *val) { + dataType* val) { BACKEND::set_dense_vector_data(selector.get_queue(), dvhandle, size, val); } inline sycl::event release_dense_vector(backend_selector selector, dense_vector_handle_t dvhandle, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::release_dense_vector(selector.get_queue(), dvhandle, dependencies); } // Dense matrix template std::enable_if_t> init_dense_matrix( - backend_selector selector, dense_matrix_handle_t *p_dmhandle, + backend_selector selector, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, sycl::buffer val) { BACKEND::init_dense_matrix(selector.get_queue(), p_dmhandle, num_rows, num_cols, ld, @@ -68,9 +68,9 @@ std::enable_if_t> init_dense_matrix( } template std::enable_if_t> init_dense_matrix( - backend_selector selector, dense_matrix_handle_t *p_dmhandle, + backend_selector selector, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, - dataType *val) { + dataType* val) { BACKEND::init_dense_matrix(selector.get_queue(), p_dmhandle, num_rows, num_cols, ld, dense_layout, val); } @@ -87,21 +87,21 @@ template std::enable_if_t> set_dense_matrix_data( backend_selector selector, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, - dataType *val) { + dataType* val) { BACKEND::set_dense_matrix_data(selector.get_queue(), dmhandle, num_rows, 
num_cols, ld, dense_layout, val); } inline sycl::event release_dense_matrix(backend_selector selector, dense_matrix_handle_t dmhandle, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::release_dense_matrix(selector.get_queue(), dmhandle, dependencies); } // COO matrix template std::enable_if_t> init_coo_matrix( - backend_selector selector, matrix_handle_t *p_smhandle, std::int64_t num_rows, + backend_selector selector, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val) { BACKEND::init_coo_matrix(selector.get_queue(), p_smhandle, num_rows, num_cols, nnz, index, @@ -109,9 +109,9 @@ std::enable_if_t> init_coo_m } template std::enable_if_t> init_coo_matrix( - backend_selector selector, matrix_handle_t *p_smhandle, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t nnz, index_base index, indexType *row_ind, - indexType *col_ind, dataType *val) { + backend_selector selector, matrix_handle_t* p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, index_base index, indexType* row_ind, + indexType* col_ind, dataType* val) { BACKEND::init_coo_matrix(selector.get_queue(), p_smhandle, num_rows, num_cols, nnz, index, row_ind, col_ind, val); } @@ -127,8 +127,8 @@ std::enable_if_t> set_coo_ma template std::enable_if_t> set_coo_matrix_data( backend_selector selector, matrix_handle_t smhandle, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t nnz, index_base index, indexType *row_ind, - indexType *col_ind, dataType *val) { + std::int64_t num_cols, std::int64_t nnz, index_base index, indexType* row_ind, + indexType* col_ind, dataType* val) { BACKEND::set_coo_matrix_data(selector.get_queue(), smhandle, num_rows, num_cols, nnz, index, row_ind, col_ind, val); } @@ -136,7 +136,7 @@ std::enable_if_t> set_coo_ma // CSR matrix template std::enable_if_t> init_csr_matrix( - 
backend_selector selector, matrix_handle_t *p_smhandle, std::int64_t num_rows, + backend_selector selector, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val) { BACKEND::init_csr_matrix(selector.get_queue(), p_smhandle, num_rows, num_cols, nnz, index, @@ -144,9 +144,9 @@ std::enable_if_t> init_csr_m } template std::enable_if_t> init_csr_matrix( - backend_selector selector, matrix_handle_t *p_smhandle, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t nnz, index_base index, indexType *row_ptr, - indexType *col_ind, dataType *val) { + backend_selector selector, matrix_handle_t* p_smhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t nnz, index_base index, indexType* row_ptr, + indexType* col_ind, dataType* val) { BACKEND::init_csr_matrix(selector.get_queue(), p_smhandle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val); } @@ -162,8 +162,8 @@ std::enable_if_t> set_csr_ma template std::enable_if_t> set_csr_matrix_data( backend_selector selector, matrix_handle_t smhandle, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t nnz, index_base index, indexType *row_ptr, - indexType *col_ind, dataType *val) { + std::int64_t num_cols, std::int64_t nnz, index_base index, indexType* row_ptr, + indexType* col_ind, dataType* val) { BACKEND::set_csr_matrix_data(selector.get_queue(), smhandle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val); } @@ -171,7 +171,7 @@ std::enable_if_t> set_csr_ma // Common sparse matrix functions inline sycl::event release_sparse_matrix(backend_selector selector, matrix_handle_t smhandle, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::release_sparse_matrix(selector.get_queue(), smhandle, dependencies); } @@ -182,30 +182,30 @@ inline bool set_matrix_property(backend_selector selector, // SPMM inline void 
init_spmm_descr(backend_selector selector, - spmm_descr_t *p_spmm_descr) { + spmm_descr_t* p_spmm_descr) { BACKEND::init_spmm_descr(selector.get_queue(), p_spmm_descr); } inline sycl::event release_spmm_descr(backend_selector selector, spmm_descr_t spmm_descr, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::release_spmm_descr(selector.get_queue(), spmm_descr, dependencies); } inline void spmm_buffer_size(backend_selector selector, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - std::size_t &temp_buffer_size) { + std::size_t& temp_buffer_size) { BACKEND::spmm_buffer_size(selector.get_queue(), opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, temp_buffer_size); } inline void spmm_optimize(backend_selector selector, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer workspace) { BACKEND::spmm_optimize(selector.get_queue(), opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, workspace); @@ -213,48 +213,48 @@ inline void spmm_optimize(backend_selector selector, oneapi::m inline sycl::event spmm_optimize(backend_selector selector, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, + const void* 
alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, - spmm_descr_t spmm_descr, void *workspace, - const std::vector &dependencies = {}) { + spmm_descr_t spmm_descr, void* workspace, + const std::vector& dependencies = {}) { return BACKEND::spmm_optimize(selector.get_queue(), opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, workspace, dependencies); } inline sycl::event spmm(backend_selector selector, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, - matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void *beta, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::spmm(selector.get_queue(), opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, dependencies); } // SPMV inline void init_spmv_descr(backend_selector selector, - spmv_descr_t *p_spmv_descr) { + spmv_descr_t* p_spmv_descr) { BACKEND::init_spmv_descr(selector.get_queue(), p_spmv_descr); } inline sycl::event release_spmv_descr(backend_selector selector, spmv_descr_t spmv_descr, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::release_spmv_descr(selector.get_queue(), spmv_descr, dependencies); } inline void spmv_buffer_size(backend_selector selector, - oneapi::mkl::transpose opA, const void *alpha, matrix_view A_view, + oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { + const void* beta, 
dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { BACKEND::spmv_buffer_size(selector.get_queue(), opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr, temp_buffer_size); } inline void spmv_optimize(backend_selector selector, oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, sycl::buffer workspace) { BACKEND::spmv_optimize(selector.get_queue(), opA, alpha, A_view, A_handle, x_handle, beta, @@ -262,47 +262,47 @@ inline void spmv_optimize(backend_selector selector, oneapi::m } inline sycl::event spmv_optimize(backend_selector selector, - oneapi::mkl::transpose opA, const void *alpha, matrix_view A_view, + oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, void *workspace, - const std::vector &dependencies = {}) { + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, void* workspace, + const std::vector& dependencies = {}) { return BACKEND::spmv_optimize(selector.get_queue(), opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr, workspace, dependencies); } inline sycl::event spmv(backend_selector selector, oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, - const std::vector &dependencies = {}) { + const 
std::vector& dependencies = {}) { return BACKEND::spmv(selector.get_queue(), opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr, dependencies); } // SPSV inline void init_spsv_descr(backend_selector selector, - spsv_descr_t *p_spsv_descr) { + spsv_descr_t* p_spsv_descr) { BACKEND::init_spsv_descr(selector.get_queue(), p_spsv_descr); } inline sycl::event release_spsv_descr(backend_selector selector, spsv_descr_t spsv_descr, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::release_spsv_descr(selector.get_queue(), spsv_descr, dependencies); } inline void spsv_buffer_size(backend_selector selector, - oneapi::mkl::transpose opA, const void *alpha, matrix_view A_view, + oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - std::size_t &temp_buffer_size) { + std::size_t& temp_buffer_size) { BACKEND::spsv_buffer_size(selector.get_queue(), opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, temp_buffer_size); } inline void spsv_optimize(backend_selector selector, oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer workspace) { @@ -311,20 +311,20 @@ inline void spsv_optimize(backend_selector selector, oneapi::m } inline sycl::event spsv_optimize(backend_selector selector, - oneapi::mkl::transpose opA, const void *alpha, matrix_view A_view, + oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, - spsv_descr_t spsv_descr, void *workspace, - const std::vector &dependencies = {}) { + spsv_descr_t spsv_descr, void* 
workspace, + const std::vector& dependencies = {}) { return BACKEND::spsv_optimize(selector.get_queue(), opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, workspace, dependencies); } inline sycl::event spsv(backend_selector selector, oneapi::mkl::transpose opA, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - const std::vector &dependencies = {}) { + const std::vector& dependencies = {}) { return BACKEND::spsv(selector.get_queue(), opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, dependencies); } diff --git a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp index 86a00f507..e99613ba3 100644 --- a/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp +++ b/include/oneapi/mkl/sparse_blas/detail/sparse_blas_rt.hpp @@ -30,176 +30,176 @@ namespace sparse { // Dense vector template std::enable_if_t> init_dense_vector( - sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size, + sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, sycl::buffer val); template std::enable_if_t> init_dense_vector( - sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size, dataType *val); + sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, dataType* val); template std::enable_if_t> set_dense_vector_data( - sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size, + sycl::queue& queue, dense_vector_handle_t dvhandle, std::int64_t size, sycl::buffer val); template std::enable_if_t> set_dense_vector_data( - sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size, dataType *val); + sycl::queue& queue, dense_vector_handle_t dvhandle, std::int64_t size, dataType* val); -sycl::event 
release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, - const std::vector &dependencies = {}); +sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle, + const std::vector& dependencies = {}); // Dense matrix template std::enable_if_t> init_dense_matrix( - sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows, + sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, sycl::buffer val); template std::enable_if_t> init_dense_matrix( - sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t ld, layout dense_layout, dataType *val); + sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout, dataType* val); template std::enable_if_t> set_dense_matrix_data( - sycl::queue &queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, + sycl::queue& queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, sycl::buffer val); template std::enable_if_t> set_dense_matrix_data( - sycl::queue &queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t ld, layout dense_layout, dataType *val); + sycl::queue& queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout, dataType* val); -sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, - const std::vector &dependencies = {}); +sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle, + const std::vector& dependencies = {}); // COO matrix template std::enable_if_t> init_coo_matrix( - sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, std::int64_t num_cols, + sycl::queue& queue, 
matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val); template std::enable_if_t> init_coo_matrix( - sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t nnz, index_base index, indexType *row_ind, indexType *col_ind, dataType *val); + sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, + std::int64_t nnz, index_base index, indexType* row_ind, indexType* col_ind, dataType* val); template std::enable_if_t> set_coo_matrix_data( - sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, + sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val); template std::enable_if_t> set_coo_matrix_data( - sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t nnz, index_base index, indexType *row_ind, indexType *col_ind, dataType *val); + sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, + std::int64_t nnz, index_base index, indexType* row_ind, indexType* col_ind, dataType* val); // CSR matrix template std::enable_if_t> init_csr_matrix( - sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, std::int64_t num_cols, + sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val); template std::enable_if_t> init_csr_matrix( - sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t nnz, index_base index, indexType *row_ptr, indexType *col_ind, dataType *val); + sycl::queue& queue, matrix_handle_t* p_smhandle, 
std::int64_t num_rows, std::int64_t num_cols, + std::int64_t nnz, index_base index, indexType* row_ptr, indexType* col_ind, dataType* val); template std::enable_if_t> set_csr_matrix_data( - sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, + sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val); template std::enable_if_t> set_csr_matrix_data( - sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, - std::int64_t nnz, index_base index, indexType *row_ptr, indexType *col_ind, dataType *val); + sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, + std::int64_t nnz, index_base index, indexType* row_ptr, indexType* col_ind, dataType* val); // Common sparse matrix functions -sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle, - const std::vector &dependencies = {}); +sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle, + const std::vector& dependencies = {}); -bool set_matrix_property(sycl::queue &queue, matrix_handle_t smhandle, matrix_property property); +bool set_matrix_property(sycl::queue& queue, matrix_handle_t smhandle, matrix_property property); // SPMM -void init_spmm_descr(sycl::queue &queue, spmm_descr_t *p_spmm_descr); +void init_spmm_descr(sycl::queue& queue, spmm_descr_t* p_spmm_descr); -sycl::event release_spmm_descr(sycl::queue &queue, spmm_descr_t spmm_descr, - const std::vector &dependencies = {}); +sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, + const std::vector& dependencies = {}); -void spmm_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, +void 
spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - std::size_t &temp_buffer_size); + std::size_t& temp_buffer_size); -void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, dense_matrix_handle_t C_handle, +void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer workspace); -sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, +sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, - spmm_descr_t spmm_descr, void *workspace, - const std::vector &dependencies = {}); + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr, void* workspace, + const std::vector& dependencies = {}); -sycl::event spmm(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, dense_matrix_handle_t C_handle, +sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, 
const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); // SPMV -void init_spmv_descr(sycl::queue &queue, spmv_descr_t *p_spmv_descr); +void init_spmv_descr(sycl::queue& queue, spmv_descr_t* p_spmv_descr); -sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, - const std::vector &dependencies = {}); +sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, + const std::vector& dependencies = {}); -void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, std::size_t &temp_buffer_size); + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size); -void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, sycl::buffer workspace); -sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, - void *workspace, const std::vector &dependencies = {}); + void* workspace, const 
std::vector& dependencies = {}); -sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, const std::vector &dependencies = {}); + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector& dependencies = {}); // SPSV -void init_spsv_descr(sycl::queue &queue, spsv_descr_t *p_spsv_descr); +void init_spsv_descr(sycl::queue& queue, spsv_descr_t* p_spsv_descr); -sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, - const std::vector &dependencies = {}); +sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, + const std::vector& dependencies = {}); -void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - std::size_t &temp_buffer_size); + std::size_t& temp_buffer_size); -void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer workspace); -sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, - spsv_alg alg, 
spsv_descr_t spsv_descr, void *workspace, - const std::vector &dependencies = {}); + spsv_alg alg, spsv_descr_t spsv_descr, void* workspace, + const std::vector& dependencies = {}); -sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); } // namespace sparse } // namespace mkl diff --git a/src/blas/backends/cublas/cublas_batch.cpp b/src/blas/backends/cublas/cublas_batch.cpp index 009bb9541..a46d2b2bb 100644 --- a/src/blas/backends/cublas/cublas_batch.cpp +++ b/src/blas/backends/cublas/cublas_batch.cpp @@ -29,122 +29,122 @@ namespace column_major { // Buffer APIs -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw 
unimplemented("blas", "copy_batch", "for column_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +void axpy_batch(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + 
sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, float beta, sycl::buffer &y, int64_t incy, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, float beta, sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, double beta, - sycl::buffer &y, int64_t incy, int64_t stride_y, int64_t batch_size) { +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, double beta, + sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, sycl::buffer, 1> &y, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, + int64_t stride_x, std::complex beta, sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -void 
gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stride_y, + sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, sycl::buffer &c, int64_t ldc, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, 
int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } template -inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, Ts alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - Ts beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +inline void gemm_batch_impl(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, Ts alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + Ts beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { using cuTypeA = typename CudaEquivalentType::Type; using cuTypeB = typename CudaEquivalentType::Type; @@ -153,7 +153,7 @@ inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose tran overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, batch_size); cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { if (!verify_support(queue, sycl::aspect::fp16)) { throw oneapi::mkl::unimplemented( "blas", "sycl::half", "half is not 
supported by the device or the sycl compiler"); @@ -161,11 +161,11 @@ inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose tran auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC( "cublasGemmStridedBatchedEx", cublasGemmStridedBatchedEx, err, handle, @@ -178,10 +178,10 @@ inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose tran } #define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, \ + void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, \ int64_t batch_size) { \ gemm_batch_impl(queue, transa, transb, m, n, k, alpha, a, \ lda, stride_a, b, ldb, stride_b, beta, c, \ @@ -200,10 +200,10 @@ GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, std::com #undef GEMM_STRIDED_BATCH_LAUNCHER #define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, 
\ - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, \ + void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, \ int64_t batch_size) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ @@ -216,379 +216,379 @@ GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag 
unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } 
-void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, sycl::buffer, 1> &c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, 
int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, 
sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + float beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for column_major 
layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } // USM APIs -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const float** x, int64_t* incx, float** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event 
copy_batch(sycl::queue& queue, int64_t* n, const double** x, int64_t* incx, double** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, + int64_t* incx, std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx, - std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const float* x, int64_t incx, + std::int64_t stridex, float* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx, - std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& 
queue, int64_t n, const double* x, int64_t incx, + std::int64_t stridex, double* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, float* alpha, const float** x, int64_t* incx, + float** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, double* 
alpha, const double** x, + int64_t* incx, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + int64_t stridex, float* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector 
&dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y, - int64_t batch_size, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float beta, float* y, int64_t incy, int64_t stride_y, + int64_t batch_size, const 
std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, const double *x, - int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, const double* x, + int64_t incx, int64_t stride_x, double beta, double* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, int64_t incy, int64_t 
stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta, - float **y, int64_t *incy, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, float* alpha, + const float** a, int64_t* lda, const float** x, int64_t* incx, float* beta, + float** y, int64_t* incy, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, const double **x, int64_t *incx, - double *beta, double **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, double* alpha, + const double** a, int64_t* lda, const double** x, int64_t* incx, + double* beta, double** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* 
beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a, - int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const float* a, + int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a, - int64_t lda, int64_t stride_a, const double *x, int64_t incx, - int64_t stride_x, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const double* a, + int64_t lda, int64_t stride_a, const double* x, int64_t incx, + int64_t 
stride_x, double* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const float **a, int64_t *lda, const float **x, int64_t *incx, float **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const float** a, int64_t* lda, const float** x, int64_t* incx, float** c, + int64_t* ldc, int64_t group_count, 
int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const double **a, int64_t *lda, const double **x, int64_t *incx, double **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const double** a, int64_t* lda, const double** x, int64_t* incx, double** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for column_major layout"); } template -inline sycl::event 
gemm_batch_strided_usm_impl(sycl::queue &queue, transpose transa, +inline sycl::event gemm_batch_strided_usm_impl(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - Ts alpha, const Ta *a, int64_t lda, int64_t stride_a, - const Tb *b, int64_t ldb, int64_t stride_b, Ts beta, - Tc *c, int64_t ldc, int64_t stride_c, + Ts alpha, const Ta* a, int64_t lda, int64_t stride_a, + const Tb* b, int64_t ldb, int64_t stride_b, Ts beta, + Tc* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuTypeA = typename CudaEquivalentType::Type; using cuTypeB = typename CudaEquivalentType::Type; using cuTypeC = typename CudaEquivalentType::Type; @@ -596,7 +596,7 @@ inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose tra overflow_check(m, n, k, lda, ldb, ldc, stride_a, stride_b, stride_c, batch_size); cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { if (!verify_support(queue, sycl::aspect::fp16)) { throw oneapi::mkl::unimplemented( "blas", "sycl::half", "half is not supported by the device or the sycl compiler"); @@ -605,7 +605,7 @@ inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose tra for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC( @@ -620,11 +620,11 @@ inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose tra } #define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const 
TYPE_A *a, int64_t lda, \ - int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, \ + int64_t stride_a, const TYPE_B* b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, TYPE_C* c, int64_t ldc, int64_t stride_c, \ + int64_t batch_size, const std::vector& dependencies) { \ return gemm_batch_strided_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, \ stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, \ batch_size, dependencies); \ @@ -642,11 +642,11 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, std: #undef GEMM_STRIDED_BATCH_LAUNCHER_USM #define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, \ + int64_t stride_a, const TYPE_B* b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, TYPE_C* c, int64_t ldc, int64_t stride_c, \ + int64_t batch_size, const std::vector& dependencies) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ dtype_string() + "," + dtype_string() + "," + \ @@ -659,11 +659,11 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event gemm_batch_usm_impl(sycl::queue 
&queue, transpose *transa, transpose *transb, - int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a, - int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, transpose* transb, + int64_t* m, int64_t* n, int64_t* k, Ts* alpha, const Ta** a, + int64_t* lda, const Tb** b, int64_t* ldb, Ts* beta, Tc** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { using cuTypeA = typename CudaEquivalentType::Type; using cuTypeB = typename CudaEquivalentType::Type; using cuTypeC = typename CudaEquivalentType::Type; @@ -673,7 +673,7 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr } cublasGemmAlgo_t cublas_gemm_algo = CUBLAS_GEMM_DEFAULT; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { if (!verify_support(queue, sycl::aspect::fp16)) { throw oneapi::mkl::unimplemented( "blas", "sycl::half", "half is not supported by the device or the sycl compiler"); @@ -682,7 +682,7 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; cublasStatus_t err; @@ -690,10 +690,10 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr CUBLAS_ERROR_FUNC_T_SYNC( "cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle, get_cublas_operation(transa[i]), get_cublas_operation(transb[i]), (int)m[i], - (int)n[i], (int)k[i], &alpha[i], (const void *const *)(a + offset), - get_cublas_datatype(), (int)lda[i], (const void *const *)(b + offset), + (int)n[i], 
(int)k[i], &alpha[i], (const void* const*)(a + offset), + get_cublas_datatype(), (int)lda[i], (const void* const*)(b + offset), get_cublas_datatype(), (int)ldb[i], &beta[i], - (void *const *)(c + offset), get_cublas_datatype(), (int)ldc[i], + (void* const*)(c + offset), get_cublas_datatype(), (int)ldc[i], (int)group_size[i], get_cublas_datatype(), cublas_gemm_algo); offset += group_size[i]; } @@ -703,11 +703,11 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr } #define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, \ + int64_t* n, int64_t* k, TYPE_S* alpha, const TYPE_A** a, int64_t* lda, \ + const TYPE_B** b, int64_t* ldb, TYPE_S* beta, TYPE_C** c, int64_t* ldc, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc, group_count, group_size, dependencies); \ } @@ -724,11 +724,11 @@ GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex #undef GEMM_BATCH_LAUNCHER_USM #define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, \ + int64_t* 
n, int64_t* k, TYPE_S* alpha, const TYPE_A** a, int64_t* lda, \ + const TYPE_B** b, int64_t* ldb, TYPE_S* beta, TYPE_C** c, int64_t* ldc, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ dtype_string() + "," + dtype_string() + "," + \ @@ -740,63 +740,63 @@ GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_BATCH_LAUNCHER_USM -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, - int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo 
upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for column_major layout"); } template -inline sycl::event trsm_batch(const char *func_name, Func func, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a, - int64_t *lda, T **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +inline sycl::event trsm_batch(const char* func_name, Func func, sycl::queue& queue, + side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, T* alpha, const T** a, + int64_t* lda, T** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler 
&cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; cublasStatus_t err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a); - auto **b_ = reinterpret_cast(b); + auto** a_ = reinterpret_cast(a); + auto** b_ = reinterpret_cast(b); CUBLAS_ERROR_FUNC_T_SYNC( func_name, func, err, handle, get_cublas_side_mode(left_right[i]), get_cublas_fill_mode(upper_lower[i]), get_cublas_operation(trans[i]), get_cublas_diag_type(unit_diag[i]), (int)m[i], (int)n[i], - (cuDataType *)&alpha[i], a_ + offset, (int)lda[i], b_ + offset, (int)ldb[i], + (cuDataType*)&alpha[i], a_ + offset, (int)lda[i], b_ + offset, (int)ldb[i], (int)group_size[i]); offset += group_size[i]; } @@ -806,11 +806,11 @@ inline sycl::event trsm_batch(const char *func_name, Func func, sycl::queue &que } #define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, \ - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \ - const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, \ + transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, TYPE* alpha, \ + const TYPE** a, int64_t* lda, TYPE** b, int64_t* ldb, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, \ dependencies); \ @@ -823,208 +823,208 @@ 
TRSM_BATCH_LAUNCHER_USM(std::complex, cublasZtrsmBatched) #undef TRSM_BATCH_LAUNCHER_USM -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, float* alpha, const float** a, int64_t* lda, float* beta, + float** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, double* alpha, const double** a, int64_t* lda, double* beta, + double** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } 
-sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, int64_t stride_a, float beta, - float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, int64_t stride_a, float beta, + float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, int64_t stride_a, double beta, - double *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, int64_t stride_a, double beta, + double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose 
trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const 
double* a, int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event 
imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stride_a, + float beta, const float* b, int64_t ldb, int64_t stride_b, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector 
&dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stride_a, + double beta, const double* b, int64_t ldb, int64_t stride_b, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t 
ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + std::complex** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", 
"omatcopy_batch", "for column_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, float **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, float** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, double **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, double** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& 
queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } @@ -1033,122 +1033,122 @@ namespace row_major { // Buffer APIs -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", 
"copy_batch", "for row_major layout"); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +void axpy_batch(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t 
stridey, int64_t batch_size) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, float beta, sycl::buffer &y, int64_t incy, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, float beta, sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, double beta, - sycl::buffer &y, int64_t incy, int64_t stride_y, int64_t batch_size) { +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, double beta, + sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, sycl::buffer, 1> &y, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, + int64_t stride_x, std::complex beta, sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - 
std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stride_y, + sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, int64_t batch_size) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, sycl::buffer &c, int64_t ldc, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + 
sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } #define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, \ + void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, \ int64_t batch_size) { \ throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ } @@ -1166,377 +1166,377 @@ GEMM_STRIDED_BATCH_LAUNCHER(std::complex, std::complex, std::com #undef GEMM_STRIDED_BATCH_LAUNCHER -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float 
alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, float 
beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, sycl::buffer, 1> &c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } 
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -void 
imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& 
a, int64_t lda, int64_t stride_a, + float beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t 
stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } // USM APIs -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const float** x, int64_t* incx, float** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const double** x, int64_t* incx, double** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, + int64_t* incx, std::complex** y, int64_t* incy, int64_t group_count, + 
int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx, - std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const float* x, int64_t incx, + std::int64_t stridex, float* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx, - std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const double* x, int64_t incx, + std::int64_t stridex, double* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + 
const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, float* alpha, const float** x, int64_t* incx, + float** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, double* alpha, const double** x, + int64_t* incx, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* 
incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + int64_t stridex, float* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, 
std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y, - int64_t batch_size, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float beta, float* y, int64_t incy, int64_t stride_y, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, const double *x, - int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, const double* x, + int64_t incx, int64_t stride_x, double beta, double* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t 
stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta, - float **y, int64_t *incy, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, float* alpha, + const float** a, int64_t* lda, const float** x, int64_t* incx, float* beta, + float** y, int64_t* incy, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, const double **x, int64_t *incx, - double *beta, double **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, 
int64_t* m, int64_t* n, double* alpha, + const double** a, int64_t* lda, const double** x, int64_t* incx, + double* beta, double** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a, - int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) 
{ +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const float* a, + int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a, - int64_t lda, int64_t stride_a, const double *x, int64_t incx, - int64_t stride_x, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const double* a, + int64_t lda, int64_t stride_a, const double* x, int64_t incx, + int64_t stride_x, double* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event 
dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const float **a, int64_t *lda, const float **x, int64_t *incx, float **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const float** a, int64_t* lda, const float** x, int64_t* incx, float** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const double **a, int64_t *lda, const double **x, int64_t *incx, double **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const double** a, int64_t* lda, const double** x, int64_t* incx, double** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* 
ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", "for row_major layout"); } #define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stride_a, const TYPE_B *b, int64_t ldb, int64_t stride_b, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stride_c, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, \ + int64_t stride_a, const TYPE_B* b, int64_t ldb, int64_t stride_b, \ + TYPE_S beta, TYPE_C* c, int64_t ldc, int64_t stride_c, \ + int64_t batch_size, const std::vector& dependencies) { \ throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ } @@ -1554,11 +1554,11 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, std::complex, std: #undef GEMM_STRIDED_BATCH_LAUNCHER_USM #define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const 
TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, \ + int64_t* n, int64_t* k, TYPE_S* alpha, const TYPE_A** a, int64_t* lda, \ + const TYPE_B** b, int64_t* ldb, TYPE_S* beta, TYPE_C** c, int64_t* ldc, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ throw unimplemented("blas", "gemm_batch", "for row_major layout"); \ } @@ -1575,51 +1575,51 @@ GEMM_BATCH_LAUNCHER_USM(std::complex, std::complex, std::complex #undef GEMM_BATCH_LAUNCHER_USM -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, - int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for row_major 
layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for row_major layout"); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", "for row_major layout"); } template -inline sycl::event trsm_batch(const char *func_name, Func func, sycl::queue &queue, - side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, T *alpha, const T **a, - int64_t *lda, T **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +inline sycl::event trsm_batch(const char* func_name, Func func, sycl::queue& queue, + side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, T* alpha, const T** a, + int64_t* lda, T** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", 
"trsm_batch", "for row_major layout"); } #define TRSM_BATCH_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, \ - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \ - const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, \ + transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, TYPE* alpha, \ + const TYPE** a, int64_t* lda, TYPE** b, int64_t* ldb, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return trsm_batch(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, \ dependencies); \ @@ -1632,208 +1632,208 @@ TRSM_BATCH_LAUNCHER_USM(std::complex, cublasZtrsmBatched) #undef TRSM_BATCH_LAUNCHER_USM -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, float* alpha, const float** a, int64_t* lda, float* beta, + float** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, double* alpha, 
const double** a, int64_t* lda, double* beta, + double** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, int64_t stride_a, float beta, - float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, int64_t stride_a, 
float beta, + float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, int64_t stride_a, double beta, - double *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, int64_t stride_a, double beta, + double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw 
unimplemented("blas", "syrk_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector 
&dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event 
imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stride_a, + float beta, const float* b, int64_t ldb, int64_t stride_b, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stride_a, + double beta, const double* b, int64_t ldb, int64_t stride_b, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t 
stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, const float **a, int64_t *lda, float **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, const float** a, int64_t* lda, float** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, const double **a, int64_t *lda, double **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event 
omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, const double** a, int64_t* lda, double** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - std::complex **b, int64_t *ldb, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + std::complex** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, float **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, float** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", 
"imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, double **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, double** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } diff --git a/src/blas/backends/cublas/cublas_extensions.cpp b/src/blas/backends/cublas/cublas_extensions.cpp index cc80b483d..c80392aa6 100644 --- a/src/blas/backends/cublas/cublas_extensions.cpp +++ b/src/blas/backends/cublas/cublas_extensions.cpp @@ -29,88 +29,88 @@ namespace column_major { // Buffer APIs -void 
gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset 
offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + 
int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } template -void omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, +void omatcopy(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? m : n); const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? 
n : m); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), get_cublas_operation(trans), logical_m, logical_n, - (cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb); + (cuDataType*)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb); }); }); } #define OMATCOPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { \ omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb); \ } @@ -122,16 +122,16 @@ OMATCOPY_LAUNCHER(std::complex, cublasZgeam) #undef OMATCOPY_LAUNCHER template -void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } #define OMATCOPY2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &b, int64_t ldb, int64_t strideb) { \ + void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE 
alpha, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& b, int64_t ldb, int64_t strideb) { \ omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, b, \ ldb, strideb); \ } @@ -142,53 +142,53 @@ OMATCOPY2_LAUNCHER(std::complex, "unimplemented") OMATCOPY2_LAUNCHER(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } template -void omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer &a, int64_t lda, - T beta, sycl::buffer &b, int64_t ldb, 
sycl::buffer &c, int64_t ldc) { +void omatadd(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer& a, int64_t lda, + T beta, sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, (cuDataType *)&alpha, a_, - lda, (cuDataType *)&beta, b_, ldb, c_, ldc); + get_cublas_operation(transb), m, n, (cuDataType*)&alpha, a_, + lda, (cuDataType*)&beta, b_, ldb, c_, ldc); }); }); } #define OMATADD_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, TYPE beta, \ - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { \ + void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, TYPE beta, \ + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { \ omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, \ b, ldb, c, ldc); \ } @@ -202,95 +202,95 @@ OMATADD_LAUNCHER(std::complex, cublasZgeam) // USM APIs -sycl::event gemm_bias(sycl::queue &queue, 
transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* 
co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", 
"for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for column_major layout"); } template -sycl::event omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { 
cgh.depends_on(dependencies); const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? m : n); const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? n : m); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), get_cublas_operation(trans), logical_m, logical_n, - (cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb); + (cuDataType*)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb); }); }); return done; } #define OMATCOPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \ - const std::vector &dependencies) { \ + sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, TYPE* b, int64_t ldb, \ + const std::vector& dependencies) { \ return omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, \ ldb, dependencies); \ } @@ -303,16 +303,16 @@ OMATCOPY_LAUNCHER_USM(std::complex, cublasZgeam) #undef OMATCOPY_LAUNCHER_USM template -sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b, - int64_t ldb, int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, int64_t stridea, T* b, + int64_t ldb, int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", 
""); } #define OMATCOPY2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, \ - int64_t strideb, const std::vector &dependencies) { \ + sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, int64_t stridea, TYPE* b, int64_t ldb, \ + int64_t strideb, const std::vector& dependencies) { \ return omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \ lda, b, ldb, strideb, dependencies); \ } @@ -323,58 +323,58 @@ OMATCOPY2_LAUNCHER_USM(std::complex, "unimplemented") OMATCOPY2_LAUNCHER_USM(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER_USM -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw 
unimplemented("blas", "imatcopy", "for column_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } template -inline sycl::event omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, T alpha, const T *a, int64_t lda, - T beta, const T *b, int64_t ldb, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event omatadd(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, T alpha, const T* a, int64_t lda, + T beta, const T* b, int64_t ldb, T* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, (cuDataType *)&alpha, a_, - lda, (cuDataType *)&beta, b_, ldb, c_, ldc); + get_cublas_operation(transb), m, n, (cuDataType*)&alpha, a_, + lda, (cuDataType*)&beta, b_, ldb, c_, ldc); }); }); return 
done; } #define OMATADD_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, \ - const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, TYPE alpha, const TYPE* a, int64_t lda, TYPE beta, \ + const TYPE* b, int64_t ldb, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, \ lda, beta, b, ldb, c, ldc, dependencies); \ } @@ -392,88 +392,88 @@ namespace row_major { // Buffer APIs -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", 
"gemm_bias", "for row_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, 
int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } template -void omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, +void omatcopy(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - queue.submit([&](sycl::handler &cgh) { + 
queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m); const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), get_cublas_operation(trans), logical_m, logical_n, - (cuDataType *)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb); + (cuDataType*)&alpha, a_, lda, nullptr, nullptr, lda, b_, ldb); }); }); } #define OMATCOPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { \ omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb); \ } @@ -485,16 +485,16 @@ OMATCOPY_LAUNCHER(std::complex, cublasZgeam) #undef OMATCOPY_LAUNCHER template -void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } #define OMATCOPY2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void 
omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &b, int64_t ldb, int64_t strideb) { \ + void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& b, int64_t ldb, int64_t strideb) { \ omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, b, \ ldb, strideb); \ } @@ -505,53 +505,53 @@ OMATCOPY2_LAUNCHER(std::complex, "unimplemented") OMATCOPY2_LAUNCHER(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", 
"imatcopy", "for row_major layout"); } template -void omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer &a, int64_t lda, - T beta, sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { +void omatadd(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, T alpha, sycl::buffer& a, int64_t lda, + T beta, sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), n, m, (cuDataType *)&alpha, a_, - lda, (cuDataType *)&beta, b_, ldb, c_, ldc); + get_cublas_operation(transb), n, m, (cuDataType*)&alpha, a_, + lda, (cuDataType*)&beta, b_, ldb, c_, ldc); }); }); } #define OMATADD_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, TYPE beta, \ - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { \ + void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, TYPE beta, \ + sycl::buffer& b, int64_t ldb, sycl::buffer& c, 
int64_t ldc) { \ omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, \ b, ldb, c, ldc); \ } @@ -565,95 +565,95 @@ OMATADD_LAUNCHER(std::complex, cublasZgeam) // USM APIs -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& 
queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose 
transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major layout"); } template -sycl::event omatcopy(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& 
dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); const int64_t logical_m = (trans == oneapi::mkl::transpose::nontrans ? n : m); const int64_t logical_n = (trans == oneapi::mkl::transpose::nontrans ? m : n); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), get_cublas_operation(trans), logical_m, logical_n, - (cuDataType *)&alpha, a_, lda, nullptr, nullptr, ldb, b_, ldb); + (cuDataType*)&alpha, a_, lda, nullptr, nullptr, ldb, b_, ldb); }); }); return done; } #define OMATCOPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \ - const std::vector &dependencies) { \ + sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, TYPE* b, int64_t ldb, \ + const std::vector& dependencies) { \ return omatcopy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, \ ldb, dependencies); \ } @@ -666,16 +666,16 @@ OMATCOPY_LAUNCHER_USM(std::complex, cublasZgeam) #undef OMATCOPY_LAUNCHER_USM template -sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b, - int64_t ldb, int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(const char* func_name, Func func, sycl::queue& queue, 
transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, int64_t stridea, T* b, + int64_t ldb, int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", ""); } #define OMATCOPY2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, \ - int64_t strideb, const std::vector &dependencies) { \ + sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, int64_t stridea, TYPE* b, int64_t ldb, \ + int64_t strideb, const std::vector& dependencies) { \ return omatcopy2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \ lda, b, ldb, strideb, dependencies); \ } @@ -686,58 +686,58 @@ OMATCOPY2_LAUNCHER_USM(std::complex, "unimplemented") OMATCOPY2_LAUNCHER_USM(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER_USM -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector 
&dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } template -inline sycl::event omatadd(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, T alpha, const T *a, int64_t lda, - T beta, const T *b, int64_t ldb, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event omatadd(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, T alpha, const T* a, int64_t lda, + T beta, const T* b, int64_t ldb, T* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), n, 
m, (cuDataType *)&alpha, a_, - lda, (cuDataType *)&beta, b_, ldb, c_, ldc); + get_cublas_operation(transb), n, m, (cuDataType*)&alpha, a_, + lda, (cuDataType*)&beta, b_, ldb, c_, ldc); }); }); return done; } #define OMATADD_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, \ - const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, TYPE alpha, const TYPE* a, int64_t lda, TYPE beta, \ + const TYPE* b, int64_t ldb, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return omatadd(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, \ lda, beta, b, ldb, c, ldc, dependencies); \ } diff --git a/src/blas/backends/cublas/cublas_handle.hpp b/src/blas/backends/cublas/cublas_handle.hpp index db9df5584..83a76c927 100644 --- a/src/blas/backends/cublas/cublas_handle.hpp +++ b/src/blas/backends/cublas/cublas_handle.hpp @@ -28,10 +28,10 @@ namespace cublas { template struct cublas_handle { - using handle_container_t = std::unordered_map *>; + using handle_container_t = std::unordered_map*>; handle_container_t cublas_handle_mapper_{}; ~cublas_handle() noexcept(false) { - for (auto &handle_pair : cublas_handle_mapper_) { + for (auto& handle_pair : cublas_handle_mapper_) { cublasStatus_t err; if (handle_pair.second != nullptr) { auto handle = handle_pair.second->exchange(nullptr); diff --git a/src/blas/backends/cublas/cublas_helper.hpp b/src/blas/backends/cublas/cublas_helper.hpp index 0fe7e7c5a..44b166a4e 100644 --- a/src/blas/backends/cublas/cublas_helper.hpp +++ b/src/blas/backends/cublas/cublas_helper.hpp @@ -81,7 +81,7 @@ void overflow_check(Index index, Next... 
indices) { class cublas_error : virtual public std::runtime_error { protected: - inline const char *cublas_error_map(cublasStatus_t error) { + inline const char* cublas_error_map(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; @@ -133,7 +133,7 @@ class cublas_error : virtual public std::runtime_error { class cuda_error : virtual public std::runtime_error { protected: - inline const char *cuda_error_map(CUresult result) { + inline const char* cuda_error_map(CUresult result) { switch (result) { case CUDA_SUCCESS: return "CUDA_SUCCESS"; case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED"; diff --git a/src/blas/backends/cublas/cublas_level1.cpp b/src/blas/backends/cublas/cublas_level1.cpp index 5f7087727..7bc9cb780 100644 --- a/src/blas/backends/cublas/cublas_level1.cpp +++ b/src/blas/backends/cublas/cublas_level1.cpp @@ -32,16 +32,16 @@ namespace column_major { // Level 1 template -inline void asum(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void asum(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -49,8 +49,8 @@ inline void asum(const char *func_name, Func func, sycl::queue &queue, int64_t n // fault. 
When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // ASUM does not support negative index CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_); @@ -63,8 +63,8 @@ inline void asum(const char *func_name, Func func, sycl::queue &queue, int64_t n } #define ASUM_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } ASUM_LAUNCHER(float, float, cublasSasum) @@ -74,26 +74,26 @@ ASUM_LAUNCHER(std::complex, double, cublasDzasum) #undef ASUM_LAUNCHER template -inline void scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a, - sycl::buffer &x, int64_t incx) { +inline void scal(const char* func_name, Func func, sycl::queue& queue, int64_t n, T1 a, + sycl::buffer& x, int64_t incx) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; // SCAL does not support negative incx - CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType1 *)&a, x_, + CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, 
(cuDataType1*)&a, x_, std::abs(incx)); }); }); } #define SCAL_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer &x, int64_t incx) { \ + void scal(sycl::queue& queue, int64_t n, TYPE1 a, sycl::buffer& x, int64_t incx) { \ scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx); \ } SCAL_LAUNCHER(float, float, cublasSscal) @@ -105,27 +105,27 @@ SCAL_LAUNCHER(double, std::complex, cublasZdscal) #undef SCAL_LAUNCHER template -inline void axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy) { +inline void axpy(const char* func_name, Func func, sycl::queue& queue, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; - CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType *)&alpha, x_, - incx, y_, incy); + CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType*)&alpha, x_, incx, + y_, incy); }); }); } #define AXPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void axpy(sycl::queue& queue, int64_t n, TYPE alpha, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy); \ } @@ -135,39 +135,39 @@ 
AXPY_LAUNCHER(std::complex, cublasCaxpy) AXPY_LAUNCHER(std::complex, cublasZaxpy) #undef AXPY_LAUNCHER -void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } -void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } template -inline void rotg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +inline void rotg(const char* func_name, Func func, sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; - queue.submit([&](sycl::handler &cgh) { + 
queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -175,10 +175,10 @@ inline void rotg(const char *func_name, Func func, sycl::queue &queue, sycl::buf // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); - auto s_ = sc.get_mem(s_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); + auto s_ = sc.get_mem(s_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, a_, b_, c_, s_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -190,8 +190,8 @@ inline void rotg(const char *func_name, Func func, sycl::queue &queue, sycl::buf } #define ROTG_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, \ - sycl::buffer &c, sycl::buffer &s) { \ + void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, \ + sycl::buffer& c, sycl::buffer& s) { \ rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s); \ } @@ -202,16 +202,16 @@ ROTG_LAUNCHER(std::complex, double, cublasZrotg) #undef ROTG_LAUNCHER template -inline void rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer ¶m) { +inline void rotm(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, 
int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& param) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -219,9 +219,9 @@ inline void rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); - auto param_ = sc.get_mem(param_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -233,8 +233,8 @@ inline void rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n } #define ROTM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { \ + void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy, sycl::buffer& param) { \ rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param); \ } @@ -243,17 +243,17 @@ ROTM_LAUNCHER(double, cublasDrotm) #undef ROTM_LAUNCHER template -inline void copy(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer 
&x, int64_t incx, sycl::buffer &y, int64_t incy) { +inline void copy(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy); }); @@ -261,8 +261,8 @@ inline void copy(const char *func_name, Func func, sycl::queue &queue, int64_t n } #define COPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -273,16 +273,16 @@ COPY_LAUNCHER(std::complex, cublasZcopy) #undef COPY_LAUNCHER template -inline void dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &result) { +inline void dot(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& result) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template 
get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -290,9 +290,9 @@ inline void dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -304,8 +304,8 @@ inline void dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, } #define DOT_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE) \ - void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, const int64_t incy, sycl::buffer &result) { \ + void dot##EXT(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, const int64_t incy, sycl::buffer& result) { \ dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result); \ } DOT_LAUNCHER(, float, cublasSdot) @@ -317,17 +317,17 @@ DOT_LAUNCHER(u, std::complex, cublasZdotu) #undef DOT_LAUNCHER template -inline void rot(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &y, int64_t incy, +inline void rot(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& y, int64_t incy, T2 c, T3 s) { using 
cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; using cuDataType3 = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -335,18 +335,18 @@ inline void rot(const char *func_name, Func func, sycl::queue &queue, int64_t n, // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. // cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, - (cuDataType2 *)&c, (cuDataType3 *)&s); + (cuDataType2*)&c, (cuDataType3*)&s); }); }); } #define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE) \ - void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, int64_t incy, TYPE2 c, TYPE3 s) { \ + void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, int64_t incy, TYPE2 c, TYPE3 s) { \ rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s); \ } @@ -356,15 +356,15 @@ ROT_LAUNCHER(std::complex, float, float, cublasCsrot) ROT_LAUNCHER(std::complex, double, double, cublasZdrot) #undef ROT_LAUNCHER -void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void sdsdot(sycl::queue& queue, int64_t n, 
float sb, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { overflow_check(n, incx, incy); // cuBLAS does not support sdot so we need to mimic sdot. - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.get_access(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -372,9 +372,9 @@ void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_SYNC(cublasSdot, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -388,23 +388,23 @@ void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, result.get_host_access(sycl::read_write)[0] += sb; } -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { throw unimplemented("blas", "dot", "for column_major layout"); } template -inline void rotmg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, T y1, sycl::buffer ¶m) { +inline void rotmg(const char* func_name, Func func, sycl::queue& 
queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, T y1, sycl::buffer& param) { using cuDataType = typename CudaEquivalentType::Type; sycl::buffer y1_buff(&y1, sycl::range<1>(1)); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto d1_acc = d1.template get_access(cgh); auto d2_acc = d2.template get_access(cgh); auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto param_acc = param.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -412,11 +412,11 @@ inline void rotmg(const char *func_name, Func func, sycl::queue &queue, sycl::bu // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto d1_ = sc.get_mem(d1_acc); - auto d2_ = sc.get_mem(d2_acc); - auto x1_ = sc.get_mem(x1_acc); - auto y1_ = sc.get_mem(y1_acc); - auto param_ = sc.get_mem(param_acc); + auto d1_ = sc.get_mem(d1_acc); + auto d2_ = sc.get_mem(d2_acc); + auto x1_ = sc.get_mem(x1_acc); + auto y1_ = sc.get_mem(y1_acc); + auto param_ = sc.get_mem(param_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_, param_); // Higher level BLAS functions expect CUBLAS_POINTER_MODE_HOST @@ -428,8 +428,8 @@ inline void rotmg(const char *func_name, Func func, sycl::queue &queue, sycl::bu } #define ROTMG_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, \ - sycl::buffer &x1, TYPE y1, sycl::buffer ¶m) { \ + void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, \ + sycl::buffer& x1, TYPE y1, sycl::buffer& param) { \ rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param); \ } @@ -438,8 +438,8 @@ ROTMG_LAUNCHER(double, cublasDrotmg) #undef ROTMG_LAUNCHER template -inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void iamax(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); // cuBLAS does not support int64_t as return type for the data. So we need to @@ -450,10 +450,10 @@ inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t // to elementwise copy the data between two buffer, or allow reinterpret cast // to convert to different type with different typesize size. 
sycl::buffer int_res_buff{ sycl::range<1>(1) }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -461,8 +461,8 @@ inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto int_res_ = sc.get_mem(int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamax returns 0. This behaviour is similar to that of // reference netlib BLAS. 
@@ -474,7 +474,7 @@ inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto int_res_acc = int_res_buff.template get_access(cgh); auto result_acc = result.template get_access(cgh); cgh.single_task( @@ -483,8 +483,8 @@ inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t } #define IAMAX_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } IAMAX_LAUNCHER(float, cublasIsamax) @@ -494,17 +494,17 @@ IAMAX_LAUNCHER(std::complex, cublasIzamax) #undef IAMAX_LAUNCHER template -inline void swap(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy) { +inline void swap(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy); }); @@ -512,8 +512,8 @@ inline void swap(const char *func_name, Func func, sycl::queue &queue, int64_t n } #define SWAP_LAUNCHER(TYPE, CUBLAS_ROUTINE) 
\ - void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -524,8 +524,8 @@ SWAP_LAUNCHER(std::complex, cublasZswap) #undef SWAP_LAUNCHER template -inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void iamin(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); // cuBLAS does not support int64_t as return type for the data. So we need to @@ -536,10 +536,10 @@ inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t // to elementwise copy the data between two buffer, or allow reinterpret cast // to convert to different type with different typesize size. sycl::buffer int_res_buff{ sycl::range<1>(1) }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -547,8 +547,8 @@ inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto int_res_ = sc.get_mem(int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); cublasStatus_t err; // For negative incx, iamin returns 0. This behaviour is similar to that of // implemented as a reference IAMIN. @@ -560,7 +560,7 @@ inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto int_res_acc = int_res_buff.template get_access(cgh); auto result_acc = result.template get_access(cgh); cgh.single_task( @@ -569,8 +569,8 @@ inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t } #define IAMIN_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } IAMIN_LAUNCHER(float, cublasIsamin) @@ -580,16 +580,16 @@ IAMIN_LAUNCHER(std::complex, cublasIzamin) #undef IAMIN_LAUNCHER template -inline void nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void nrm2(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); // By default the pointer mode is the CUBLAS_POINTER_MODE_HOST // when the data is on buffer, it must be set to @@ -597,8 +597,8 @@ inline void nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); - auto x_ = sc.get_mem(x_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); cublasStatus_t err; // NRM2 does not support negative index CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, std::abs(incx), res_); @@ -611,8 +611,8 @@ inline void nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n } #define NRM2_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } NRM2_LAUNCHER(float, float, cublasSnrm2) @@ -625,24 +625,24 @@ NRM2_LAUNCHER(std::complex, double, cublasDznrm2) // Level 1 template -inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T1 *x, const int64_t incx, T2 *result, - const std::vector &dependencies) { +inline sycl::event asum(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T1* x, const int64_t incx, T2* result, + const std::vector& dependencies) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; overflow_check(n, incx); bool result_on_device = sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = 
dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto res_ = reinterpret_cast(result); if (result_on_device) { cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); } @@ -658,8 +658,8 @@ inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, in } #define ASUM_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event asum(sycl::queue& queue, int64_t n, const TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } ASUM_LAUNCHER_USM(float, float, cublasSasum) @@ -669,22 +669,22 @@ ASUM_LAUNCHER_USM(std::complex, double, cublasDzasum) #undef ASUM_LAUNCHER_USM template -inline sycl::event scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a, - T2 *x, int64_t incx, const std::vector &dependencies) { +inline sycl::event scal(const char* func_name, Func func, sycl::queue& queue, int64_t n, T1 a, + T2* x, int64_t incx, const std::vector& dependencies) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, 
[=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); + auto x_ = reinterpret_cast(x); cublasStatus_t err; // SCAL does not support negative incx - CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType1 *)&a, x_, + CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType1*)&a, x_, std::abs(incx)); }); }); @@ -692,8 +692,8 @@ inline sycl::event scal(const char *func_name, Func func, sycl::queue &queue, in } #define SCAL_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event scal(sycl::queue& queue, int64_t n, TYPE1 a, TYPE2* x, int64_t incx, \ + const std::vector& dependencies) { \ return scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx, dependencies); \ } SCAL_LAUNCHER_USM(float, float, cublasSscal) @@ -705,31 +705,31 @@ SCAL_LAUNCHER_USM(double, std::complex, cublasZdscal) #undef SCAL_LAUNCHER_USM template -inline sycl::event axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha, - const T *x, int64_t incx, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event axpy(const char* func_name, Func func, sycl::queue& queue, int64_t n, T alpha, + const T* x, int64_t incx, T* y, int64_t incy, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = 
reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; - CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType *)&alpha, x_, - incx, y_, incy); + CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, (cuDataType*)&alpha, x_, incx, + y_, incy); }); }); return done; } #define AXPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \ - TYPE *y, int64_t incy, const std::vector &dependencies) { \ + sycl::event axpy(sycl::queue& queue, int64_t n, TYPE alpha, const TYPE* x, int64_t incx, \ + TYPE* y, int64_t incy, const std::vector& dependencies) { \ return axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, \ dependencies); \ } @@ -740,32 +740,32 @@ AXPY_LAUNCHER_USM(std::complex, cublasCaxpy) AXPY_LAUNCHER_USM(std::complex, cublasZaxpy) #undef AXPY_LAUNCHER_USM -sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - float beta, float *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + float beta, float* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const 
std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } template -inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c, - T1 *s, const std::vector &dependencies) { +inline sycl::event rotg(const char* func_name, Func func, sycl::queue& queue, T1* a, T1* b, T2* c, + T1* s, const std::vector& dependencies) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; auto ctx = queue.get_context(); @@ -783,17 +783,17 @@ inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1 "If any pointer is only device accessible, all must be device accessible"); } } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); - auto s_ = reinterpret_cast(s); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + auto s_ = reinterpret_cast(s); if (results_on_device) { 
cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); } @@ -808,8 +808,8 @@ inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1 } #define ROTG_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \ - const std::vector &dependencies) { \ + sycl::event rotg(sycl::queue& queue, TYPE1* a, TYPE1* b, TYPE2* c, TYPE1* s, \ + const std::vector& dependencies) { \ return rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s, dependencies); \ } @@ -820,21 +820,21 @@ ROTG_LAUNCHER_USM(std::complex, double, cublasZrotg) #undef ROTG_LAUNCHER_USM template -inline sycl::event rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n, T *x, - int64_t incx, T *y, int64_t incy, T *param, - const std::vector &dependencies) { +inline sycl::event rotm(const char* func_name, Func func, sycl::queue& queue, int64_t n, T* x, + int64_t incx, T* y, int64_t incy, T* param, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); - auto param_ = reinterpret_cast(param); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto param_ = reinterpret_cast(param); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, param_); }); @@ -843,8 +843,8 @@ inline sycl::event rotm(const char *func_name, Func func, sycl::queue &queue, in } #define ROTM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - 
sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - TYPE *param, const std::vector &dependencies) { \ + sycl::event rotm(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + TYPE* param, const std::vector& dependencies) { \ return rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param, \ dependencies); \ } @@ -854,20 +854,20 @@ ROTM_LAUNCHER_USM(double, cublasDrotm) #undef ROTM_LAUNCHER_USM template -inline sycl::event copy(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x, - int64_t incx, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event copy(const char* func_name, Func func, sycl::queue& queue, int64_t n, const T* x, + int64_t incx, T* y, int64_t incy, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy); }); @@ -876,8 +876,8 @@ inline sycl::event copy(const char *func_name, Func func, sycl::queue &queue, in } #define COPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event copy(sycl::queue& queue, int64_t n, const TYPE* x, int64_t incx, TYPE* y, \ + int64_t incy, const std::vector& 
dependencies) { \ return copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -888,23 +888,23 @@ COPY_LAUNCHER_USM(std::complex, cublasZcopy) #undef COPY_LAUNCHER_USM template -inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x, - const int64_t incx, const T *y, int64_t incy, T *result, - const std::vector &dependencies) { +inline sycl::event dot(const char* func_name, Func func, sycl::queue& queue, int64_t n, const T* x, + const int64_t incx, const T* y, int64_t incy, T* result, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); bool result_on_device = sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto res_ = reinterpret_cast(result); if (result_on_device) { cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); } @@ -919,9 +919,9 @@ inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int } #define DOT_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE) \ - sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - const TYPE *y, const int64_t incy, TYPE *result, \ - const std::vector &dependencies) { \ + sycl::event dot##EXT(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + const TYPE* y, const int64_t incy, TYPE* result, \ + const 
std::vector& dependencies) { \ return dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result, \ dependencies); \ } @@ -934,34 +934,34 @@ DOT_LAUNCHER_USM(u, std::complex, cublasZdotu) #undef DOT_LAUNCHER_USM template -inline sycl::event rot(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 *x, - const int64_t incx, T1 *y, int64_t incy, T2 c, T3 s, - const std::vector &dependencies) { +inline sycl::event rot(const char* func_name, Func func, sycl::queue& queue, int64_t n, T1* x, + const int64_t incx, T1* y, int64_t incy, T2 c, T3 s, + const std::vector& dependencies) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; using cuDataType3 = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy, - (cuDataType2 *)&c, (cuDataType3 *)&s); + (cuDataType2*)&c, (cuDataType3*)&s); }); }); return done; } #define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE) \ - sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \ + sycl::event rot(sycl::queue& queue, int64_t n, TYPE1* x, const int64_t incx, TYPE1* y, \ int64_t incy, TYPE2 c, TYPE3 s, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, \ 
dependencies); \ } @@ -972,23 +972,23 @@ ROT_LAUNCHER_USM(std::complex, float, float, cublasCsrot) ROT_LAUNCHER_USM(std::complex, double, double, cublasZdrot) #undef ROT_LAUNCHER_USM -sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx, - const float *y, int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event sdsdot(sycl::queue& queue, int64_t n, float sb, const float* x, int64_t incx, + const float* y, int64_t incy, float* result, + const std::vector& dependencies) { overflow_check(n, incx, incy); bool result_on_device = sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device; // cuBLAS does not support sdsdot so we need to mimic sdot. - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto res_ = reinterpret_cast(result); if (result_on_device) { cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); } @@ -1017,14 +1017,14 @@ sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int6 } } -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, double *result, const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, double* result, const std::vector& dependencies) { throw unimplemented("blas", "dot", "for column_major layout"); } template -inline sycl::event rotmg(const char *func_name, Func 
func, sycl::queue &queue, T *d1, T *d2, T *x1, - T y1, T *param, const std::vector &dependencies) { +inline sycl::event rotmg(const char* func_name, Func func, sycl::queue& queue, T* d1, T* d2, T* x1, + T y1, T* param, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; auto ctx = queue.get_context(); bool results_on_device = (sycl::get_pointer_type(d1, ctx) == sycl::usm::alloc::device || @@ -1039,22 +1039,22 @@ inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T "If any pointer is only device accessible, all must be device accessible"); } } - cuDataType *y1_; + cuDataType* y1_; if (results_on_device) { y1_ = sycl::malloc_device(1, queue); queue.memcpy(y1_, &y1, sizeof(cuDataType)).wait(); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto d1_ = reinterpret_cast(d1); - auto d2_ = reinterpret_cast(d2); - auto x1_ = reinterpret_cast(x1); - auto param_ = reinterpret_cast(param); + auto d1_ = reinterpret_cast(d1); + auto d2_ = reinterpret_cast(d2); + auto x1_ = reinterpret_cast(x1); + auto param_ = reinterpret_cast(param); cublasStatus_t err; if (results_on_device) { cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); @@ -1062,7 +1062,7 @@ inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); } else { - auto y1_c = reinterpret_cast(&y1); + auto y1_c = reinterpret_cast(&y1); CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, d1_, d2_, x1_, y1_c, param_); } }); @@ -1076,8 +1076,8 @@ inline sycl::event rotmg(const char *func_name, Func func, 
sycl::queue &queue, T } #define ROTMG_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param, \ - const std::vector &dependencies) { \ + sycl::event rotmg(sycl::queue& queue, TYPE* d1, TYPE* d2, TYPE* x1, TYPE y1, TYPE* param, \ + const std::vector& dependencies) { \ return rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies); \ } @@ -1086,9 +1086,9 @@ ROTMG_LAUNCHER_USM(double, cublasDrotmg) #undef ROTMG_LAUNCHER_USM template -inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T *x, const int64_t incx, int64_t *result, - const std::vector &dependencies) { +inline sycl::event iamax(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T* x, const int64_t incx, int64_t* result, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); // cuBLAS does not support int64_t as return type for the data. So we need to @@ -1097,7 +1097,7 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i // This change may cause failure as the result of integer overflow // based on the size. 
int int_res = 0; - int *int_res_p = nullptr; + int* int_res_p = nullptr; bool result_on_device = sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device; if (result_on_device) { @@ -1106,14 +1106,14 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i else { int_res_p = &int_res; } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); + auto x_ = reinterpret_cast(x); if (result_on_device) { cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); } @@ -1128,7 +1128,7 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i }); done.wait(); if (result_on_device) { - auto last_ev = queue.submit([&](sycl::handler &cgh) { + auto last_ev = queue.submit([&](sycl::handler& cgh) { cgh.single_task([=]() { *result = std::max((int64_t)*int_res_p - 1, (int64_t)0); }); }); last_ev.wait(); @@ -1142,8 +1142,8 @@ inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, i } #define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamax(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + int64_t* result, const std::vector& dependencies) { \ return iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } IAMAX_LAUNCHER_USM(float, cublasIsamax) @@ -1153,20 +1153,20 @@ IAMAX_LAUNCHER_USM(std::complex, cublasIzamax) #undef IAMAX_LAUNCHER_USM template -inline sycl::event swap(const char *func_name, Func 
func, sycl::queue &queue, int64_t n, T *x, - int64_t incx, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event swap(const char* func_name, Func func, sycl::queue& queue, int64_t n, T* x, + int64_t incx, T* y, int64_t incy, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, n, x_, incx, y_, incy); }); @@ -1175,8 +1175,8 @@ inline sycl::event swap(const char *func_name, Func func, sycl::queue &queue, in } #define SWAP_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event swap(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -1187,9 +1187,9 @@ SWAP_LAUNCHER_USM(std::complex, cublasZswap) #undef SWAP_LAUNCHER_USM template -inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T *x, const int64_t incx, int64_t *result, - const std::vector &dependencies) { +inline sycl::event iamin(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T* x, const int64_t incx, int64_t* result, + const std::vector& dependencies) { 
using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); // cuBLAS does not support int64_t as return type for the data. So we need to @@ -1198,7 +1198,7 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i // This change may cause failure as the result of integer overflow // based on the size. int int_res = 0; - int *int_res_p = nullptr; + int* int_res_p = nullptr; bool result_on_device = sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device; if (result_on_device) { @@ -1207,14 +1207,14 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i else { int_res_p = &int_res; } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); + auto x_ = reinterpret_cast(x); if (result_on_device) { cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); } @@ -1229,7 +1229,7 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i }); done.wait(); if (result_on_device) { - auto last_ev = queue.submit([&](sycl::handler &cgh) { + auto last_ev = queue.submit([&](sycl::handler& cgh) { cgh.single_task([=]() { *result = std::max((int64_t)*int_res_p - 1, (int64_t)0); }); }); last_ev.wait(); @@ -1243,8 +1243,8 @@ inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, i } #define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamin(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ 
+ int64_t* result, const std::vector& dependencies) { \ return iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } IAMIN_LAUNCHER_USM(float, cublasIsamin) @@ -1254,24 +1254,24 @@ IAMIN_LAUNCHER_USM(std::complex, cublasIzamin) #undef IAMIN_LAUNCHER_USM template -inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T1 *x, const int64_t incx, T2 *result, - const std::vector &dependencies) { +inline sycl::event nrm2(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T1* x, const int64_t incx, T2* result, + const std::vector& dependencies) { using cuDataType1 = typename CudaEquivalentType::Type; using cuDataType2 = typename CudaEquivalentType::Type; overflow_check(n, incx); bool result_on_device = sycl::get_pointer_type(result, queue.get_context()) == sycl::usm::alloc::device; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto res_ = reinterpret_cast(result); if (result_on_device) { cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE); } @@ -1287,8 +1287,8 @@ inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, in } #define NRM2_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event nrm2(sycl::queue& queue, int64_t n, const TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return 
nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } NRM2_LAUNCHER_USM(float, float, cublasSnrm2) @@ -1304,14 +1304,14 @@ namespace row_major { // Level 1 template -inline void asum(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void asum(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "asum", "for row_major layout"); } #define ASUM_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } ASUM_LAUNCHER(float, float, cublasSasum) @@ -1321,13 +1321,13 @@ ASUM_LAUNCHER(std::complex, double, cublasDzasum) #undef ASUM_LAUNCHER template -inline void scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a, - sycl::buffer &x, int64_t incx) { +inline void scal(const char* func_name, Func func, sycl::queue& queue, int64_t n, T1 a, + sycl::buffer& x, int64_t incx) { throw unimplemented("blas", "scal", "for row_major layout"); } #define SCAL_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer &x, int64_t incx) { \ + void scal(sycl::queue& queue, int64_t n, TYPE1 a, sycl::buffer& x, int64_t incx) { \ scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx); \ } SCAL_LAUNCHER(float, float, cublasSscal) @@ -1339,14 +1339,14 @@ SCAL_LAUNCHER(double, std::complex, cublasZdscal) #undef SCAL_LAUNCHER template -inline void axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy) { +inline void axpy(const char* func_name, Func func, 
sycl::queue& queue, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpy", "for row_major layout"); } #define AXPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void axpy(sycl::queue& queue, int64_t n, TYPE alpha, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy); \ } @@ -1356,37 +1356,37 @@ AXPY_LAUNCHER(std::complex, cublasCaxpy) AXPY_LAUNCHER(std::complex, cublasZaxpy) #undef AXPY_LAUNCHER -void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpby", "for row_major layout"); } -void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpby", "for row_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for row_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + 
sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for row_major layout"); } template -inline void rotg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +inline void rotg(const char* func_name, Func func, sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { throw unimplemented("blas", "rotg", "for row_major layout"); } #define ROTG_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, \ - sycl::buffer &c, sycl::buffer &s) { \ + void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, \ + sycl::buffer& c, sycl::buffer& s) { \ rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s); \ } @@ -1397,15 +1397,15 @@ ROTG_LAUNCHER(std::complex, double, cublasZrotg) #undef ROTG_LAUNCHER template -inline void rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer ¶m) { +inline void rotm(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& param) { throw unimplemented("blas", "rotm", "for row_major layout"); } #define ROTM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { \ + void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy, sycl::buffer& param) { \ rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param); \ } @@ -1414,14 +1414,14 @@ ROTM_LAUNCHER(double, cublasDrotm) #undef ROTM_LAUNCHER template -inline void copy(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy) { +inline void copy(const char* func_name, Func func, sycl::queue& queue, 
int64_t n, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "copy", "for row_major layout"); } #define COPY_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -1432,15 +1432,15 @@ COPY_LAUNCHER(std::complex, cublasZcopy) #undef COPY_LAUNCHER template -inline void dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &result) { +inline void dot(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& result) { throw unimplemented("blas", "dot", "for row_major layout"); } #define DOT_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE) \ - void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, const int64_t incy, sycl::buffer &result) { \ + void dot##EXT(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, const int64_t incy, sycl::buffer& result) { \ dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result); \ } DOT_LAUNCHER(, float, cublasSdot) @@ -1452,15 +1452,15 @@ DOT_LAUNCHER(u, std::complex, cublasZdotu) #undef DOT_LAUNCHER template -inline void rot(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &y, int64_t incy, +inline void rot(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& y, int64_t incy, T2 c, T3 s) { throw unimplemented("blas", "rot", "for row_major layout"); } #define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE) \ - 
void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, int64_t incy, TYPE2 c, TYPE3 s) { \ + void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, int64_t incy, TYPE2 c, TYPE3 s) { \ rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s); \ } @@ -1470,25 +1470,25 @@ ROT_LAUNCHER(std::complex, float, float, cublasCsrot) ROT_LAUNCHER(std::complex, double, double, cublasZdrot) #undef ROT_LAUNCHER -void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void sdsdot(sycl::queue& queue, int64_t n, float sb, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { throw unimplemented("blas", "sdsdot", "for row_major layout"); } -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { throw unimplemented("blas", "dot", "for row_major layout"); } template -inline void rotmg(const char *func_name, Func func, sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, T y1, sycl::buffer ¶m) { +inline void rotmg(const char* func_name, Func func, sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, T y1, sycl::buffer& param) { throw unimplemented("blas", "rotmg", "for row_major layout"); } #define ROTMG_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, \ - sycl::buffer &x1, TYPE y1, sycl::buffer ¶m) { \ + void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, \ + sycl::buffer& x1, TYPE y1, sycl::buffer& param) { \ rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param); \ } @@ -1497,14 +1497,14 @@ ROTMG_LAUNCHER(double, cublasDrotmg) #undef 
ROTMG_LAUNCHER template -inline void iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void iamax(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "iamax", "for row_major layout"); } #define IAMAX_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } IAMAX_LAUNCHER(float, cublasIsamax) @@ -1514,14 +1514,14 @@ IAMAX_LAUNCHER(std::complex, cublasIzamax) #undef IAMAX_LAUNCHER template -inline void swap(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy) { +inline void swap(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "swap", "for row_major layout"); } #define SWAP_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -1532,14 +1532,14 @@ SWAP_LAUNCHER(std::complex, cublasZswap) #undef SWAP_LAUNCHER template -inline void iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void iamin(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "iamin", "for 
row_major layout"); } #define IAMIN_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } IAMIN_LAUNCHER(float, cublasIsamin) @@ -1549,14 +1549,14 @@ IAMIN_LAUNCHER(std::complex, cublasIzamin) #undef IAMIN_LAUNCHER template -inline void nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n, - sycl::buffer &x, const int64_t incx, sycl::buffer &result) { +inline void nrm2(const char* func_name, Func func, sycl::queue& queue, int64_t n, + sycl::buffer& x, const int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "nrm2", "for row_major layout"); } #define NRM2_LAUNCHER(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result); \ } NRM2_LAUNCHER(float, float, cublasSnrm2) @@ -1569,15 +1569,15 @@ NRM2_LAUNCHER(std::complex, double, cublasDznrm2) // Level 1 template -inline sycl::event asum(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T1 *x, const int64_t incx, T2 *result, - const std::vector &dependencies) { +inline sycl::event asum(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T1* x, const int64_t incx, T2* result, + const std::vector& dependencies) { throw unimplemented("blas", "asum", "for row_major layout"); } #define ASUM_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event asum(sycl::queue& queue, int64_t n, const 
TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return asum(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } ASUM_LAUNCHER_USM(float, float, cublasSasum) @@ -1587,14 +1587,14 @@ ASUM_LAUNCHER_USM(std::complex, double, cublasDzasum) #undef ASUM_LAUNCHER_USM template -inline sycl::event scal(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 a, - T2 *x, int64_t incx, const std::vector &dependencies) { +inline sycl::event scal(const char* func_name, Func func, sycl::queue& queue, int64_t n, T1 a, + T2* x, int64_t incx, const std::vector& dependencies) { throw unimplemented("blas", "scal", "for row_major layout"); } #define SCAL_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event scal(sycl::queue& queue, int64_t n, TYPE1 a, TYPE2* x, int64_t incx, \ + const std::vector& dependencies) { \ return scal(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, a, x, incx, dependencies); \ } SCAL_LAUNCHER_USM(float, float, cublasSscal) @@ -1606,15 +1606,15 @@ SCAL_LAUNCHER_USM(double, std::complex, cublasZdscal) #undef SCAL_LAUNCHER_USM template -inline sycl::event axpy(const char *func_name, Func func, sycl::queue &queue, int64_t n, T alpha, - const T *x, int64_t incx, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event axpy(const char* func_name, Func func, sycl::queue& queue, int64_t n, T alpha, + const T* x, int64_t incx, T* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpy", "for row_major layout"); } #define AXPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \ - TYPE *y, int64_t incy, const std::vector &dependencies) { \ + sycl::event axpy(sycl::queue& queue, int64_t n, TYPE alpha, const TYPE* x, int64_t incx, \ + TYPE* y, 
int64_t incy, const std::vector& dependencies) { \ return axpy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, \ dependencies); \ } @@ -1625,38 +1625,38 @@ AXPY_LAUNCHER_USM(std::complex, cublasCaxpy) AXPY_LAUNCHER_USM(std::complex, cublasZaxpy) #undef AXPY_LAUNCHER_USM -sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - float beta, float *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + float beta, float* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for row_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for row_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for row_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for 
row_major layout"); } template -inline sycl::event rotg(const char *func_name, Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c, - T1 *s, const std::vector &dependencies) { +inline sycl::event rotg(const char* func_name, Func func, sycl::queue& queue, T1* a, T1* b, T2* c, + T1* s, const std::vector& dependencies) { throw unimplemented("blas", "rotg", "for row_major layout"); } #define ROTG_LAUNCHER_USM(TYPE1, TYPE2, CUBLAS_ROUTINE) \ - sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \ - const std::vector &dependencies) { \ + sycl::event rotg(sycl::queue& queue, TYPE1* a, TYPE1* b, TYPE2* c, TYPE1* s, \ + const std::vector& dependencies) { \ return rotg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, a, b, c, s, dependencies); \ } @@ -1667,15 +1667,15 @@ ROTG_LAUNCHER_USM(std::complex, double, cublasZrotg) #undef ROTG_LAUNCHER_USM template -inline sycl::event rotm(const char *func_name, Func func, sycl::queue &queue, int64_t n, T *x, - int64_t incx, T *y, int64_t incy, T *param, - const std::vector &dependencies) { +inline sycl::event rotm(const char* func_name, Func func, sycl::queue& queue, int64_t n, T* x, + int64_t incx, T* y, int64_t incy, T* param, + const std::vector& dependencies) { throw unimplemented("blas", "rotm", "for row_major layout"); } #define ROTM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - TYPE *param, const std::vector &dependencies) { \ + sycl::event rotm(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + TYPE* param, const std::vector& dependencies) { \ return rotm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, param, \ dependencies); \ } @@ -1685,15 +1685,15 @@ ROTM_LAUNCHER_USM(double, cublasDrotm) #undef ROTM_LAUNCHER_USM template -inline sycl::event copy(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x, - int64_t incx, T *y, int64_t incy, - const 
std::vector &dependencies) { +inline sycl::event copy(const char* func_name, Func func, sycl::queue& queue, int64_t n, const T* x, + int64_t incx, T* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "copy", "for row_major layout"); } #define COPY_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event copy(sycl::queue& queue, int64_t n, const TYPE* x, int64_t incx, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return copy(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -1704,16 +1704,16 @@ COPY_LAUNCHER_USM(std::complex, cublasZcopy) #undef COPY_LAUNCHER_USM template -inline sycl::event dot(const char *func_name, Func func, sycl::queue &queue, int64_t n, const T *x, - const int64_t incx, const T *y, int64_t incy, T *result, - const std::vector &dependencies) { +inline sycl::event dot(const char* func_name, Func func, sycl::queue& queue, int64_t n, const T* x, + const int64_t incx, const T* y, int64_t incy, T* result, + const std::vector& dependencies) { throw unimplemented("blas", "dot", "for row_major layout"); } #define DOT_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE) \ - sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - const TYPE *y, const int64_t incy, TYPE *result, \ - const std::vector &dependencies) { \ + sycl::event dot##EXT(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + const TYPE* y, const int64_t incy, TYPE* result, \ + const std::vector& dependencies) { \ return dot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, result, \ dependencies); \ } @@ -1726,16 +1726,16 @@ DOT_LAUNCHER_USM(u, std::complex, cublasZdotu) #undef DOT_LAUNCHER_USM template -inline sycl::event rot(const char *func_name, Func func, sycl::queue &queue, int64_t n, T1 *x, - const int64_t incx, T1 
*y, int64_t incy, T2 c, T3 s, - const std::vector &dependencies) { +inline sycl::event rot(const char* func_name, Func func, sycl::queue& queue, int64_t n, T1* x, + const int64_t incx, T1* y, int64_t incy, T2 c, T3 s, + const std::vector& dependencies) { throw unimplemented("blas", "rot", "for row_major layout"); } #define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, CUBLAS_ROUTINE) \ - sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \ + sycl::event rot(sycl::queue& queue, int64_t n, TYPE1* x, const int64_t incx, TYPE1* y, \ int64_t incy, TYPE2 c, TYPE3 s, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return rot(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, \ dependencies); \ } @@ -1746,26 +1746,26 @@ ROT_LAUNCHER_USM(std::complex, float, float, cublasCsrot) ROT_LAUNCHER_USM(std::complex, double, double, cublasZdrot) #undef ROT_LAUNCHER_USM -sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx, - const float *y, int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event sdsdot(sycl::queue& queue, int64_t n, float sb, const float* x, int64_t incx, + const float* y, int64_t incy, float* result, + const std::vector& dependencies) { throw unimplemented("blas", "sdsdot", "for row_major layout"); } -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, double *result, const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, double* result, const std::vector& dependencies) { throw unimplemented("blas", "dot", "for row_major layout"); } template -inline sycl::event rotmg(const char *func_name, Func func, sycl::queue &queue, T *d1, T *d2, T *x1, - T y1, T *param, const std::vector &dependencies) { +inline sycl::event rotmg(const char* func_name, Func func, sycl::queue& queue, T* d1, T* d2, T* 
x1, + T y1, T* param, const std::vector& dependencies) { throw unimplemented("blas", "rotmg", "for row_major layout"); } #define ROTMG_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param, \ - const std::vector &dependencies) { \ + sycl::event rotmg(sycl::queue& queue, TYPE* d1, TYPE* d2, TYPE* x1, TYPE y1, TYPE* param, \ + const std::vector& dependencies) { \ return rotmg(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies); \ } @@ -1774,15 +1774,15 @@ ROTMG_LAUNCHER_USM(double, cublasDrotmg) #undef ROTMG_LAUNCHER_USM template -inline sycl::event iamax(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T *x, const int64_t incx, int64_t *result, - const std::vector &dependencies) { +inline sycl::event iamax(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T* x, const int64_t incx, int64_t* result, + const std::vector& dependencies) { throw unimplemented("blas", "iamax", "for row_major layout"); } #define IAMAX_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamax(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + int64_t* result, const std::vector& dependencies) { \ return iamax(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } IAMAX_LAUNCHER_USM(float, cublasIsamax) @@ -1792,15 +1792,15 @@ IAMAX_LAUNCHER_USM(std::complex, cublasIzamax) #undef IAMAX_LAUNCHER_USM template -inline sycl::event swap(const char *func_name, Func func, sycl::queue &queue, int64_t n, T *x, - int64_t incx, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event swap(const char* func_name, Func func, sycl::queue& queue, int64_t n, T* x, + int64_t incx, T* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", 
"swap", "for row_major layout"); } #define SWAP_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event swap(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return swap(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -1811,15 +1811,15 @@ SWAP_LAUNCHER_USM(std::complex, cublasZswap) #undef SWAP_LAUNCHER_USM template -inline sycl::event iamin(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T *x, const int64_t incx, int64_t *result, - const std::vector &dependencies) { +inline sycl::event iamin(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T* x, const int64_t incx, int64_t* result, + const std::vector& dependencies) { throw unimplemented("blas", "iamin", "for row_major layout"); } #define IAMIN_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamin(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + int64_t* result, const std::vector& dependencies) { \ return iamin(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } IAMIN_LAUNCHER_USM(float, cublasIsamin) @@ -1829,15 +1829,15 @@ IAMIN_LAUNCHER_USM(std::complex, cublasIzamin) #undef IAMIN_LAUNCHER_USM template -inline sycl::event nrm2(const char *func_name, Func func, sycl::queue &queue, int64_t n, - const T1 *x, const int64_t incx, T2 *result, - const std::vector &dependencies) { +inline sycl::event nrm2(const char* func_name, Func func, sycl::queue& queue, int64_t n, + const T1* x, const int64_t incx, T2* result, + const std::vector& dependencies) { throw unimplemented("blas", "nrm2", "for row_major layout"); } #define NRM2_LAUNCHER_USM(TYPE1, 
TYPE2, CUBLAS_ROUTINE) \ - sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event nrm2(sycl::queue& queue, int64_t n, const TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return nrm2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } NRM2_LAUNCHER_USM(float, float, cublasSnrm2) diff --git a/src/blas/backends/cublas/cublas_level2.cpp b/src/blas/backends/cublas/cublas_level2.cpp index 8f711243b..b0ef21d53 100644 --- a/src/blas/backends/cublas/cublas_level2.cpp +++ b/src/blas/backends/cublas/cublas_level2.cpp @@ -31,32 +31,32 @@ namespace column_major { // Buffer APIs template -inline void gemv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void gemv(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, m, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m, - n, (cuDataType 
*)&alpha, a_, lda, x_, incx, - (cuDataType *)&beta, y_, incy); + n, (cuDataType*)&alpha, a_, lda, x_, incx, (cuDataType*)&beta, + y_, incy); }); }); } #define GEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, \ incy); \ } @@ -68,32 +68,32 @@ GEMV_LAUNCHER(std::complex, cublasZgemv) #undef GEMV_LAUNCHER template -inline void gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, int64_t kl, int64_t ku, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void gbmv(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, int64_t kl, int64_t ku, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx, T beta, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, m, lda, kl, ku, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; 
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m, - n, kl, ku, (cuDataType *)&alpha, a_, lda, x_, incx, - (cuDataType *)&beta, y_, incy); + n, kl, ku, (cuDataType*)&alpha, a_, lda, x_, incx, + (cuDataType*)&beta, y_, incy); }); }); } #define GBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ - int64_t incx, TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ + int64_t incx, TYPE beta, sycl::buffer& y, int64_t incy) { \ gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, \ beta, y, incy); \ } @@ -105,30 +105,30 @@ GBMV_LAUNCHER(std::complex, cublasZgbmv) #undef GBMV_LAUNCHER template -inline void ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void ger(const char* func_name, Func func, sycl::queue& queue, int64_t m, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, m, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = 
sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; - CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, (cuDataType *)&alpha, x_, + CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, (cuDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); } #define GER_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE) \ - void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer &x, \ - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, \ + void ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer& x, \ + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, \ int64_t lda) { \ ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda); \ } @@ -142,32 +142,32 @@ GER_LAUNCHER(c, std::complex, cublasZgerc) #undef GER_LAUNCHER template -inline void hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void hbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; 
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, - a_, lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, k, (cuDataType*)&alpha, + a_, lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); } #define HBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \ beta, y, incy); \ } @@ -177,32 +177,32 @@ HBMV_LAUNCHER(std::complex, cublasZhbmv) #undef HBMV_LAUNCHER template -inline void hemv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - T beta, sycl::buffer &y, int64_t incy) { +inline void hemv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + T beta, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = 
sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, a_, + lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); } #define HEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \ y, incy); \ } @@ -212,31 +212,31 @@ HEMV_LAUNCHER(std::complex, cublasZhemv) #undef HEMV_LAUNCHER template -inline void her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - ScalarType alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &a, int64_t lda) { +inline void her(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + ScalarType alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& a, int64_t lda) { using cuScalarType = typename CudaEquivalentType::Type; using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; 
CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, + get_cublas_fill_mode(upper_lower), n, (cuScalarType*)&alpha, x_, incx, a_, lda); }); }); } #define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, \ + void her(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, \ int64_t lda) { \ her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -247,32 +247,32 @@ HER_LAUNCHER(double, std::complex, cublasZher) #undef HER_LAUNCHER template -inline void her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void her2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + 
get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); } #define HER2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ lda); \ } @@ -283,32 +283,32 @@ HER2_LAUNCHER(std::complex, cublasZher2) #undef HER2_LAUNCHER template -inline void hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hpmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, 
(cuDataType*)&alpha, a_, + x_, incx, (cuDataType*)&beta, y_, incy); }); }); } #define HPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \ incy); \ } @@ -319,30 +319,30 @@ HPMV_LAUNCHER(std::complex, cublasZhpmv) #undef HPMV_LAUNCHER template -inline void hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - ScalarType alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &a) { +inline void hpr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + ScalarType alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& a) { using cuScalarType = typename CudaEquivalentType::Type; using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, + get_cublas_fill_mode(upper_lower), n, (cuScalarType*)&alpha, x_, incx, a_); }); }); } #define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ - 
sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -352,32 +352,32 @@ HPR_LAUNCHER(double, std::complex, cublasZhpr) #undef HPR_LAUNCHER template -inline void hpr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void hpr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_); }); }); } #define HPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, 
int64_t incy, \ + sycl::buffer& a) { \ hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -387,32 +387,32 @@ HPR2_LAUNCHER(std::complex, cublasZhpr2) #undef HPR2_LAUNCHER template -inline void sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void sbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, - a_, lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, k, (cuDataType*)&alpha, + a_, lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); } #define SBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, 
\ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \ beta, y, incy); \ } @@ -423,32 +423,32 @@ SBMV_LAUNCHER(double, cublasDsbmv) #undef SBMV_LAUNCHER template -inline void symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - T beta, sycl::buffer &y, int64_t incy) { +inline void symv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + T beta, sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, a_, + lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); } #define SYMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void symv(sycl::queue& queue, uplo 
upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \ y, incy); \ } @@ -459,28 +459,28 @@ SYMV_LAUNCHER(double, cublasDsymv) #undef SYMV_LAUNCHER template -inline void syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { +inline void syr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, a_, lda); }); }); } #define SYR_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { \ + void syr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { \ syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -492,32 +492,32 @@ SYR_LAUNCHER(std::complex, cublasZsyr) #undef SYR_LAUNCHER template -inline 
void syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void syr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); } #define SYR2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ lda); \ } @@ -531,32 +531,32 @@ SYR2_LAUNCHER(std::complex, cublasZsyr2) #undef SYR2_LAUNCHER template -inline void spmv(const char *func_name, Func func, sycl::queue &queue, uplo 
upper_lower, int64_t n, - T alpha, sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void spmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, a_, + x_, incx, (cuDataType*)&beta, y_, incy); }); }); } #define SPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \ incy); \ } @@ -567,28 +567,28 @@ SPMV_LAUNCHER(double, cublasDspmv) #undef SPMV_LAUNCHER template -inline void spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, 
sycl::buffer &x, int64_t incx, sycl::buffer &a) { +inline void spr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& a) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, a_); }); }); } #define SPR_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -598,32 +598,32 @@ SPR_LAUNCHER(double, cublasDspr) #undef SPR_LAUNCHER template -inline void spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void spr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + 
queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_); }); }); } #define SPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a) { \ spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -633,18 +633,18 @@ SPR2_LAUNCHER(double, cublasDspr2) #undef SPR2_LAUNCHER template -inline void tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx) { +inline void tbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template 
get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -654,8 +654,8 @@ inline void tbmv(const char *func_name, Func func, sycl::queue &queue, uplo uppe } #define TBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \ x, incx); \ @@ -669,18 +669,18 @@ TBMV_LAUNCHER(std::complex, cublasZtbmv) #undef TBMV_LAUNCHER template -inline void tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx) { +inline void tbsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -690,8 +690,8 @@ inline void tbsv(const char *func_name, Func func, sycl::queue &queue, uplo uppe } #define TBSV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \ x, incx); \ @@ -705,18 +705,18 @@ TBSV_LAUNCHER(std::complex, cublasZtbsv) #undef TBSV_LAUNCHER template -inline void tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, - sycl::buffer &x, int64_t incx) { +inline void tpmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, + sycl::buffer& x, int64_t incx) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, 
get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -726,8 +726,8 @@ inline void tpmv(const char *func_name, Func func, sycl::queue &queue, uplo uppe } #define TPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, \ incx); \ } @@ -740,18 +740,18 @@ TPMV_LAUNCHER(std::complex, cublasZtpmv) #undef TPMV_LAUNCHER template -inline void tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, - sycl::buffer &x, int64_t incx) { +inline void tpsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, + sycl::buffer& x, int64_t incx) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -761,8 +761,8 @@ inline void tpsv(const char *func_name, Func func, sycl::queue &queue, uplo uppe } #define TPSV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag 
unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, \ incx); \ } @@ -775,18 +775,18 @@ TPSV_LAUNCHER(std::complex, cublasZtpsv) #undef TPSV_LAUNCHER template -inline void trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { +inline void trmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -796,8 +796,8 @@ inline void trmv(const char *func_name, Func func, sycl::queue &queue, uplo uppe } #define TRMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ 
trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \ incx); \ } @@ -810,18 +810,18 @@ TRMV_LAUNCHER(std::complex, cublasZtrmv) #undef TRMV_LAUNCHER template -inline void trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { +inline void trsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -831,8 +831,8 @@ inline void trsv(const char *func_name, Func func, sycl::queue &queue, uplo uppe } #define TRSV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \ incx); \ } @@ -847,35 +847,35 @@ TRSV_LAUNCHER(std::complex, cublasZtrsv) // USM APIs template -inline sycl::event gemv(const char 
*func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gemv(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, m, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m, - n, (cuDataType *)&alpha, a_, lda, x_, incx, - (cuDataType *)&beta, y_, incy); + n, (cuDataType*)&alpha, a_, lda, x_, incx, (cuDataType*)&beta, + y_, incy); }); }); return done; } #define GEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, 
queue, trans, m, n, alpha, a, lda, x, incx, \ beta, y, incy, dependencies); \ } @@ -887,36 +887,36 @@ GEMV_LAUNCHER_USM(std::complex, cublasZgemv) #undef GEMV_LAUNCHER_USM template -inline sycl::event gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, int64_t kl, int64_t ku, T alpha, const T *a, - int64_t lda, const T *x, int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gbmv(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, int64_t kl, int64_t ku, T alpha, const T* a, + int64_t lda, const T* x, int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, m, lda, kl, ku, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), m, - n, kl, ku, (cuDataType *)&alpha, a_, lda, x_, incx, - (cuDataType *)&beta, y_, incy); + n, kl, ku, (cuDataType*)&alpha, a_, lda, x_, incx, + (cuDataType*)&beta, y_, incy); }); }); return done; } #define GBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ - int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, \ - int64_t incx, TYPE beta, TYPE *y, int64_t 
incy, \ - const std::vector &dependencies) { \ + sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ + int64_t ku, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* x, \ + int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -928,23 +928,23 @@ GBMV_LAUNCHER_USM(std::complex, cublasZgbmv) #undef GBMV_LAUNCHER_USM template -inline sycl::event ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n, - T alpha, const T *x, int64_t incx, const T *y, int64_t incy, T *a, - int64_t lda, const std::vector &dependencies) { +inline sycl::event ger(const char* func_name, Func func, sycl::queue& queue, int64_t m, int64_t n, + T alpha, const T* x, int64_t incx, const T* y, int64_t incy, T* a, + int64_t lda, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, m, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; - CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, (cuDataType *)&alpha, x_, + CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, (cuDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); @@ -952,9 +952,9 @@ inline sycl::event ger(const char *func_name, Func func, sycl::queue 
&queue, int } #define GER_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE) \ - sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, \ dependencies); \ } @@ -968,35 +968,35 @@ GER_LAUNCHER_USM(c, std::complex, cublasZgerc) #undef GER_LAUNCHER_USM template -inline sycl::event hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event hbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, int64_t k, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), 
n, k, (cuDataType *)&alpha, - a_, lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, k, (cuDataType*)&alpha, + a_, lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); return done; } #define HBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -1006,34 +1006,34 @@ HBMV_LAUNCHER_USM(std::complex, cublasZhbmv) #undef HBMV_LAUNCHER_USM template -inline sycl::event hemv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx, - T beta, T *y, int64_t incy, const std::vector &dependencies) { +inline sycl::event hemv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, int64_t lda, const T* x, int64_t incx, + T beta, T* y, int64_t incy, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - 
auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, a_, + lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); return done; } #define HEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -1043,25 +1043,25 @@ HEMV_LAUNCHER_USM(std::complex, cublasZhemv) #undef HEMV_LAUNCHER_USM template -inline sycl::event her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, const ScalarType alpha, const DataType *x, int64_t incx, - DataType *a, int64_t lda, const std::vector &dependencies) { +inline sycl::event her(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, const ScalarType alpha, const DataType* x, int64_t incx, + DataType* a, int64_t lda, const std::vector& dependencies) { using cuScalarType = typename CudaEquivalentType::Type; using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - 
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, + get_cublas_fill_mode(upper_lower), n, (cuScalarType*)&alpha, x_, incx, a_, lda); }); }); @@ -1069,9 +1069,9 @@ inline sycl::event her(const char *func_name, Func func, sycl::queue &queue, upl } #define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \ dependencies); \ } @@ -1082,24 +1082,24 @@ HER_LAUNCHER_USM(double, std::complex, cublasZher) #undef HER_LAUNCHER_USM template -inline sycl::event her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, int64_t lda, const std::vector &dependencies) { +inline sycl::event her2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, int64_t lda, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t 
num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); @@ -1107,9 +1107,9 @@ inline sycl::event her2(const char *func_name, Func func, sycl::queue &queue, up } #define HER2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, lda, dependencies); \ } @@ -1120,34 +1120,34 @@ HER2_LAUNCHER_USM(std::complex, cublasZher2) #undef HER2_LAUNCHER_USM template -inline sycl::event hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event hpmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { using cuDataType = typename 
CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, a_, + x_, incx, (cuDataType*)&beta, y_, incy); }); }); return done; } #define HPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, \ beta, y, incy, dependencies); \ } @@ -1158,24 +1158,24 @@ HPMV_LAUNCHER_USM(std::complex, cublasZhpmv) #undef HPMV_LAUNCHER_USM template -inline sycl::event hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, const ScalarType alpha, const DataType *x, int64_t incx, - DataType *a, const std::vector &dependencies) { +inline sycl::event hpr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, const 
ScalarType alpha, const DataType* x, int64_t incx, + DataType* a, const std::vector& dependencies) { using cuScalarType = typename CudaEquivalentType::Type; using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuScalarType *)&alpha, + get_cublas_fill_mode(upper_lower), n, (cuScalarType*)&alpha, x_, incx, a_); }); }); @@ -1183,9 +1183,9 @@ inline sycl::event hpr(const char *func_name, Func func, sycl::queue &queue, upl } #define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, \ + const std::vector& dependencies) { \ return hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, \ dependencies); \ } @@ -1196,24 +1196,24 @@ HPR_LAUNCHER_USM(double, std::complex, cublasZhpr) #undef HPR_LAUNCHER_USM template -inline sycl::event hpr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, const std::vector &dependencies) { +inline sycl::event 
hpr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_); }); }); @@ -1221,9 +1221,9 @@ inline sycl::event hpr2(const char *func_name, Func func, sycl::queue &queue, up } #define HPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, dependencies); \ } @@ -1234,35 +1234,35 @@ HPR2_LAUNCHER_USM(std::complex, cublasZhpr2) #undef HPR2_LAUNCHER_USM template -inline sycl::event sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, 
int64_t k, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event sbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, int64_t k, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, k, (cuDataType *)&alpha, - a_, lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, k, (cuDataType*)&alpha, + a_, lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); return done; } #define SBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, 
alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -1273,34 +1273,34 @@ SBMV_LAUNCHER_USM(double, cublasDsbmv) #undef SBMV_LAUNCHER_USM template -inline sycl::event symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx, - T beta, T *y, int64_t incy, const std::vector &dependencies) { +inline sycl::event symv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, int64_t lda, const T* x, int64_t incx, + T beta, T* y, int64_t incy, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - lda, x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, a_, + lda, x_, incx, (cuDataType*)&beta, y_, incy); }); }); return done; } #define SYMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE 
alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -1311,23 +1311,23 @@ SYMV_LAUNCHER_USM(double, cublasDsymv) #undef SYMV_LAUNCHER_USM template -inline sycl::event syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event syr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, T* a, int64_t lda, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, a_, lda); }); }); @@ -1335,9 +1335,9 @@ inline sycl::event syr(const char *func_name, Func func, sycl::queue &queue, upl } #define SYR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr(sycl::queue& queue, uplo 
upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \ dependencies); \ } @@ -1350,24 +1350,24 @@ SYR_LAUNCHER_USM(std::complex, cublasZsyr) #undef SYR_LAUNCHER_USM template -inline sycl::event syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, int64_t lda, const std::vector &dependencies) { +inline sycl::event syr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, int64_t lda, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); @@ -1375,9 +1375,9 @@ inline sycl::event syr2(const char *func_name, Func func, sycl::queue &queue, up } #define SYR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - 
int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, lda, dependencies); \ } @@ -1391,34 +1391,34 @@ SYR2_LAUNCHER_USM(std::complex, cublasZsyr2) #undef SYR2_LAUNCHER_USM template -inline sycl::event spmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event spmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, a_, - x_, incx, (cuDataType *)&beta, y_, incy); + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, a_, + x_, incx, (cuDataType*)&beta, y_, incy); }); }); return done; } 
#define SPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, \ beta, y, incy, dependencies); \ } @@ -1429,23 +1429,23 @@ SPMV_LAUNCHER_USM(double, cublasDspmv) #undef SPMV_LAUNCHER_USM template -inline sycl::event spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, T *a, - const std::vector &dependencies) { +inline sycl::event spr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, T* a, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, a_); }); }); @@ -1453,8 +1453,8 @@ inline sycl::event spr(const char *func_name, Func func, sycl::queue &queue, 
upl } #define SPR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, const std::vector &dependencies) { \ + sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, const std::vector& dependencies) { \ return spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, \ dependencies); \ } @@ -1465,24 +1465,24 @@ SPR_LAUNCHER_USM(double, cublasDspr) #undef SPR_LAUNCHER_USM template -inline sycl::event spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, const std::vector &dependencies) { +inline sycl::event spr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, - get_cublas_fill_mode(upper_lower), n, (cuDataType *)&alpha, x_, + get_cublas_fill_mode(upper_lower), n, (cuDataType*)&alpha, x_, incx, y_, incy, a_); }); }); @@ -1490,9 +1490,9 @@ inline sycl::event spr2(const char 
*func_name, Func func, sycl::queue &queue, up } #define SPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, dependencies); \ } @@ -1503,21 +1503,21 @@ SPR2_LAUNCHER_USM(double, cublasDspr2) #undef SPR2_LAUNCHER_USM template -inline sycl::event tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a, - int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, const T* a, + int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -1528,9 +1528,9 @@ inline sycl::event tbmv(const 
char *func_name, Func func, sycl::queue &queue, up } #define TBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \ a, lda, x, incx, dependencies); \ } @@ -1543,21 +1543,21 @@ TBMV_LAUNCHER_USM(std::complex, cublasZtbmv) #undef TBMV_LAUNCHER_USM template -inline sycl::event tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a, - int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tbsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, const T* a, + int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ 
-1568,9 +1568,9 @@ inline sycl::event tbsv(const char *func_name, Func func, sycl::queue &queue, up } #define TBSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \ a, lda, x, incx, dependencies); \ } @@ -1583,20 +1583,20 @@ TBSV_LAUNCHER_USM(std::complex, cublasZtbsv) #undef TBSV_LAUNCHER_USM template -inline sycl::event tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -1607,9 
+1607,9 @@ inline sycl::event tpmv(const char *func_name, Func func, sycl::queue &queue, up } #define TPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ x, incx, dependencies); \ } @@ -1622,20 +1622,20 @@ TPMV_LAUNCHER_USM(std::complex, cublasZtpmv) #undef TPMV_LAUNCHER_USM template -inline sycl::event tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -1646,9 +1646,9 @@ inline sycl::event tpsv(const char *func_name, Func 
func, sycl::queue &queue, up } #define TPSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ x, incx, dependencies); \ } @@ -1661,20 +1661,20 @@ TPSV_LAUNCHER_USM(std::complex, cublasZtpsv) #undef TPSV_LAUNCHER_USM template -inline sycl::event trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event trmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -1685,9 +1685,9 @@ inline sycl::event trmv(const char *func_name, Func func, sycl::queue &queue, up } 
#define TRMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ lda, x, incx, dependencies); \ } @@ -1700,20 +1700,20 @@ TRMV_LAUNCHER_USM(std::complex, cublasZtrmv) #undef TRMV_LAUNCHER_USM template -inline sycl::event trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event trsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), @@ -1724,9 +1724,9 @@ inline sycl::event trsv(const char *func_name, Func func, sycl::queue &queue, up } 
#define TRSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ lda, x, incx, dependencies); \ } @@ -1744,16 +1744,16 @@ namespace row_major { // Buffer APIs template -inline void gemv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void gemv(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "gemv", "for row_major layout"); } #define GEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, \ incy); \ } @@ -1765,16 +1765,16 @@ GEMV_LAUNCHER(std::complex, cublasZgemv) #undef GEMV_LAUNCHER template -inline void gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, int64_t kl, int64_t ku, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t 
incx, T beta, sycl::buffer &y, int64_t incy) { +inline void gbmv(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, int64_t kl, int64_t ku, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx, T beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "gbmv", "for row_major layout"); } #define GBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ - int64_t incx, TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ + int64_t incx, TYPE beta, sycl::buffer& y, int64_t incy) { \ gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, \ beta, y, incy); \ } @@ -1786,15 +1786,15 @@ GBMV_LAUNCHER(std::complex, cublasZgbmv) #undef GBMV_LAUNCHER template -inline void ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void ger(const char* func_name, Func func, sycl::queue& queue, int64_t m, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { throw unimplemented("blas", "ger", "for row_major layout"); } #define GER_LAUNCHER(EXT, TYPE, CUBLAS_ROUTINE) \ - void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer &x, \ - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, \ + void ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer& x, \ + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, \ int64_t lda) { \ ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda); \ } @@ -1808,16 
+1808,16 @@ GER_LAUNCHER(c, std::complex, cublasZgerc) #undef GER_LAUNCHER template -inline void hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void hbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "hbmv", "for row_major layout"); } #define HBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \ beta, y, incy); \ } @@ -1827,16 +1827,16 @@ HBMV_LAUNCHER(std::complex, cublasZhbmv) #undef HBMV_LAUNCHER template -inline void hemv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - T beta, sycl::buffer &y, int64_t incy) { +inline void hemv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + T beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "hemv", "for row_major layout"); } #define HEMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void 
hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \ y, incy); \ } @@ -1846,15 +1846,15 @@ HEMV_LAUNCHER(std::complex, cublasZhemv) #undef HEMV_LAUNCHER template -inline void her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - ScalarType alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &a, int64_t lda) { +inline void her(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + ScalarType alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& a, int64_t lda) { throw unimplemented("blas", "her", "for row_major layout"); } #define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, \ + void her(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, \ int64_t lda) { \ her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -1865,16 +1865,16 @@ HER_LAUNCHER(double, std::complex, cublasZher) #undef HER_LAUNCHER template -inline void her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void her2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { throw unimplemented("blas", "her2", "for row_major layout"); } #define HER2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, 
int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ lda); \ } @@ -1885,16 +1885,16 @@ HER2_LAUNCHER(std::complex, cublasZher2) #undef HER2_LAUNCHER template -inline void hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hpmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "hpmv", "for row_major layout"); } #define HPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \ incy); \ } @@ -1905,15 +1905,15 @@ HPMV_LAUNCHER(std::complex, cublasZhpmv) #undef HPMV_LAUNCHER template -inline void hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - ScalarType alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &a) { +inline void hpr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + ScalarType alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& a) { throw unimplemented("blas", "hpr", "for row_major layout"); } #define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - void hpr(sycl::queue &queue, uplo 
upper_lower, int64_t n, SCALAR_TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -1923,16 +1923,16 @@ HPR_LAUNCHER(double, std::complex, cublasZhpr) #undef HPR_LAUNCHER template -inline void hpr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void hpr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { throw unimplemented("blas", "hpr2", "for row_major layout"); } #define HPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a) { \ hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -1942,16 +1942,16 @@ HPR2_LAUNCHER(std::complex, cublasZhpr2) #undef HPR2_LAUNCHER template -inline void sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void sbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "sbmv", "for row_major layout"); } #define SBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - 
void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, \ beta, y, incy); \ } @@ -1962,16 +1962,16 @@ SBMV_LAUNCHER(double, cublasDsbmv) #undef SBMV_LAUNCHER template -inline void symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - T beta, sycl::buffer &y, int64_t incy) { +inline void symv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + T beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "symv", "for row_major layout"); } #define SYMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void symv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, \ y, incy); \ } @@ -1982,14 +1982,14 @@ SYMV_LAUNCHER(double, cublasDsymv) #undef SYMV_LAUNCHER template -inline void syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { +inline void syr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, 
sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { throw unimplemented("blas", "syr", "for row_major layout"); } #define SYR_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { \ + void syr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { \ syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -2001,16 +2001,16 @@ SYR_LAUNCHER(std::complex, cublasZsyr) #undef SYR_LAUNCHER template -inline void syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void syr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { throw unimplemented("blas", "syr2", "for row_major layout"); } #define SYR2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ lda); \ } @@ -2024,16 +2024,16 @@ SYR2_LAUNCHER(std::complex, cublasZsyr2) #undef SYR2_LAUNCHER template -inline void spmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void spmv(const char* func_name, Func func, sycl::queue& queue, uplo 
upper_lower, int64_t n, + T alpha, sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "spmv", "for row_major layout"); } #define SPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, \ incy); \ } @@ -2044,14 +2044,14 @@ SPMV_LAUNCHER(double, cublasDspmv) #undef SPMV_LAUNCHER template -inline void spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &a) { +inline void spr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& a) { throw unimplemented("blas", "spr", "for row_major layout"); } #define SPR_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -2061,16 +2061,16 @@ SPR_LAUNCHER(double, cublasDspr) #undef SPR_LAUNCHER template -inline void spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - T alpha, sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void spr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + T alpha, sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, 
+ sycl::buffer& a) { throw unimplemented("blas", "spr2", "for row_major layout"); } #define SPR2_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a) { \ spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -2080,15 +2080,15 @@ SPR2_LAUNCHER(double, cublasDspr2) #undef SPR2_LAUNCHER template -inline void tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx) { +inline void tbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx) { throw unimplemented("blas", "tbmv", "for row_major layout"); } #define TBMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \ x, incx); \ @@ -2102,15 +2102,15 @@ TBMV_LAUNCHER(std::complex, cublasZtbmv) #undef TBMV_LAUNCHER template -inline void tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx) { +inline void tbsv(const char* func_name, Func func, sycl::queue& queue, uplo 
upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx) { throw unimplemented("blas", "tbsv", "for row_major layout"); } #define TBSV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, \ x, incx); \ @@ -2124,15 +2124,15 @@ TBSV_LAUNCHER(std::complex, cublasZtbsv) #undef TBSV_LAUNCHER template -inline void tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, - sycl::buffer &x, int64_t incx) { +inline void tpmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, + sycl::buffer& x, int64_t incx) { throw unimplemented("blas", "tpmv", "for row_major layout"); } #define TPMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, \ incx); \ } @@ -2145,15 +2145,15 @@ TPMV_LAUNCHER(std::complex, cublasZtpmv) #undef TPMV_LAUNCHER template -inline void tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, - sycl::buffer &x, int64_t incx) { +inline void tpsv(const char* func_name, Func func, 
sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, + sycl::buffer& x, int64_t incx) { throw unimplemented("blas", "tpsv", "for row_major layout"); } #define TPSV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, \ incx); \ } @@ -2166,15 +2166,15 @@ TPSV_LAUNCHER(std::complex, cublasZtpsv) #undef TPSV_LAUNCHER template -inline void trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { +inline void trmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { throw unimplemented("blas", "trmv", "for row_major layout"); } #define TRMV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \ incx); \ } @@ -2187,15 +2187,15 @@ TRMV_LAUNCHER(std::complex, cublasZtrmv) #undef TRMV_LAUNCHER template -inline void trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, sycl::buffer &a, int64_t lda, - sycl::buffer &x, int64_t incx) { +inline void trsv(const 
char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, sycl::buffer& a, int64_t lda, + sycl::buffer& x, int64_t incx) { throw unimplemented("blas", "trsv", "for row_major layout"); } #define TRSV_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, \ incx); \ } @@ -2210,17 +2210,17 @@ TRSV_LAUNCHER(std::complex, cublasZtrsv) // USM APIs template -inline sycl::event gemv(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gemv(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "gemv", "for row_major layout"); } #define GEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return gemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, \ beta, y, incy, dependencies); \ } @@ -2232,18 
+2232,18 @@ GEMV_LAUNCHER_USM(std::complex, cublasZgemv) #undef GEMV_LAUNCHER_USM template -inline sycl::event gbmv(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, int64_t kl, int64_t ku, T alpha, const T *a, - int64_t lda, const T *x, int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gbmv(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, int64_t kl, int64_t ku, T alpha, const T* a, + int64_t lda, const T* x, int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "gbmv", "for row_major layout"); } #define GBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ - int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, \ - int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ + int64_t ku, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* x, \ + int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return gbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -2255,16 +2255,16 @@ GBMV_LAUNCHER_USM(std::complex, cublasZgbmv) #undef GBMV_LAUNCHER_USM template -inline sycl::event ger(const char *func_name, Func func, sycl::queue &queue, int64_t m, int64_t n, - T alpha, const T *x, int64_t incx, const T *y, int64_t incy, T *a, - int64_t lda, const std::vector &dependencies) { +inline sycl::event ger(const char* func_name, Func func, sycl::queue& queue, int64_t m, int64_t n, + T alpha, const T* x, int64_t incx, const T* y, int64_t incy, T* a, + int64_t lda, const std::vector& dependencies) { throw unimplemented("blas", "ger", "for row_major layout"); } #define 
GER_LAUNCHER_USM(EXT, TYPE, CUBLAS_ROUTINE) \ - sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return ger(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, \ dependencies); \ } @@ -2278,17 +2278,17 @@ GER_LAUNCHER_USM(c, std::complex, cublasZgerc) #undef GER_LAUNCHER_USM template -inline sycl::event hbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event hbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, int64_t k, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "hbmv", "for row_major layout"); } #define HBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return hbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -2298,16 +2298,16 @@ HBMV_LAUNCHER_USM(std::complex, cublasZhbmv) #undef HBMV_LAUNCHER_USM template -inline sycl::event hemv(const char 
*func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx, - T beta, T *y, int64_t incy, const std::vector &dependencies) { +inline sycl::event hemv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, int64_t lda, const T* x, int64_t incx, + T beta, T* y, int64_t incy, const std::vector& dependencies) { throw unimplemented("blas", "hemv", "for row_major layout"); } #define HEMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hemv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -2317,16 +2317,16 @@ HEMV_LAUNCHER_USM(std::complex, cublasZhemv) #undef HEMV_LAUNCHER_USM template -inline sycl::event her(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, const ScalarType alpha, const DataType *x, int64_t incx, - DataType *a, int64_t lda, const std::vector &dependencies) { +inline sycl::event her(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, const ScalarType alpha, const DataType* x, int64_t incx, + DataType* a, int64_t lda, const std::vector& dependencies) { throw unimplemented("blas", "her", "for row_major layout"); } #define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + 
sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \ dependencies); \ } @@ -2337,16 +2337,16 @@ HER_LAUNCHER_USM(double, std::complex, cublasZher) #undef HER_LAUNCHER_USM template -inline sycl::event her2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, int64_t lda, const std::vector &dependencies) { +inline sycl::event her2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, int64_t lda, const std::vector& dependencies) { throw unimplemented("blas", "her2", "for row_major layout"); } #define HER2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, lda, dependencies); \ } @@ -2357,16 +2357,16 @@ HER2_LAUNCHER_USM(std::complex, cublasZher2) #undef HER2_LAUNCHER_USM template -inline sycl::event hpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event hpmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, const T* x, int64_t incx, T 
beta, T* y, + int64_t incy, const std::vector& dependencies) { throw unimplemented("blas", "hpmv", "for row_major layout"); } #define HPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, \ beta, y, incy, dependencies); \ } @@ -2377,16 +2377,16 @@ HPMV_LAUNCHER_USM(std::complex, cublasZhpmv) #undef HPMV_LAUNCHER_USM template -inline sycl::event hpr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, const ScalarType alpha, const DataType *x, int64_t incx, - DataType *a, const std::vector &dependencies) { +inline sycl::event hpr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, const ScalarType alpha, const DataType* x, int64_t incx, + DataType* a, const std::vector& dependencies) { throw unimplemented("blas", "hpr", "for row_major layout"); } #define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, CUBLAS_ROUTINE) \ - sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, \ + const std::vector& dependencies) { \ return hpr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, \ dependencies); \ } @@ -2397,16 +2397,16 @@ HPR_LAUNCHER_USM(double, std::complex, cublasZhpr) #undef HPR_LAUNCHER_USM template -inline sycl::event hpr2(const char 
*func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, const std::vector &dependencies) { +inline sycl::event hpr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, const std::vector& dependencies) { throw unimplemented("blas", "hpr2", "for row_major layout"); } #define HPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return hpr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, dependencies); \ } @@ -2417,17 +2417,17 @@ HPR2_LAUNCHER_USM(std::complex, cublasZhpr2) #undef HPR2_LAUNCHER_USM template -inline sycl::event sbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event sbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, int64_t k, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "sbmv", "for row_major layout"); } #define SBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event sbmv(sycl::queue& queue, uplo 
upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return sbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -2438,16 +2438,16 @@ SBMV_LAUNCHER_USM(double, cublasDsbmv) #undef SBMV_LAUNCHER_USM template -inline sycl::event symv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *a, int64_t lda, const T *x, int64_t incx, - T beta, T *y, int64_t incy, const std::vector &dependencies) { +inline sycl::event symv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, int64_t lda, const T* x, int64_t incx, + T beta, T* y, int64_t incy, const std::vector& dependencies) { throw unimplemented("blas", "symv", "for row_major layout"); } #define SYMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return symv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, \ incx, beta, y, incy, dependencies); \ } @@ -2458,16 +2458,16 @@ SYMV_LAUNCHER_USM(double, cublasDsymv) #undef SYMV_LAUNCHER_USM template -inline sycl::event syr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event syr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, T* 
a, int64_t lda, + const std::vector& dependencies) { throw unimplemented("blas", "syr", "for row_major layout"); } #define SYR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, \ dependencies); \ } @@ -2480,16 +2480,16 @@ SYR_LAUNCHER_USM(std::complex, cublasZsyr) #undef SYR_LAUNCHER_USM template -inline sycl::event syr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, int64_t lda, const std::vector &dependencies) { +inline sycl::event syr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, int64_t lda, const std::vector& dependencies) { throw unimplemented("blas", "syr2", "for row_major layout"); } #define SYR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, lda, dependencies); \ } @@ -2503,16 +2503,16 @@ SYR2_LAUNCHER_USM(std::complex, cublasZsyr2) #undef SYR2_LAUNCHER_USM template -inline sycl::event spmv(const char *func_name, Func func, sycl::queue 
&queue, uplo upper_lower, - int64_t n, T alpha, const T *a, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event spmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* a, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { throw unimplemented("blas", "spmv", "for row_major layout"); } #define SPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return spmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, \ beta, y, incy, dependencies); \ } @@ -2523,15 +2523,15 @@ SPMV_LAUNCHER_USM(double, cublasDspmv) #undef SPMV_LAUNCHER_USM template -inline sycl::event spr(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, T *a, - const std::vector &dependencies) { +inline sycl::event spr(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, T* a, + const std::vector& dependencies) { throw unimplemented("blas", "spr", "for row_major layout"); } #define SPR_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, const std::vector &dependencies) { \ + sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, const std::vector& dependencies) { \ return spr(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, 
\ dependencies); \ } @@ -2542,16 +2542,16 @@ SPR_LAUNCHER_USM(double, cublasDspr) #undef SPR_LAUNCHER_USM template -inline sycl::event spr2(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - int64_t n, T alpha, const T *x, int64_t incx, const T *y, int64_t incy, - T *a, const std::vector &dependencies) { +inline sycl::event spr2(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + int64_t n, T alpha, const T* x, int64_t incx, const T* y, int64_t incy, + T* a, const std::vector& dependencies) { throw unimplemented("blas", "spr2", "for row_major layout"); } #define SPR2_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return spr2(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, \ incy, a, dependencies); \ } @@ -2562,17 +2562,17 @@ SPR2_LAUNCHER_USM(double, cublasDspr2) #undef SPR2_LAUNCHER_USM template -inline sycl::event tbmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a, - int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tbmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, const T* a, + int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "tbmv", "for row_major layout"); } #define TBMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, 
int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \ a, lda, x, incx, dependencies); \ } @@ -2585,17 +2585,17 @@ TBMV_LAUNCHER_USM(std::complex, cublasZtbmv) #undef TBMV_LAUNCHER_USM template -inline sycl::event tbsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, int64_t k, const T *a, - int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tbsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, int64_t k, const T* a, + int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "tbsv", "for row_major layout"); } #define TBSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, \ a, lda, x, incx, dependencies); \ } @@ -2608,16 +2608,16 @@ TBSV_LAUNCHER_USM(std::complex, cublasZtbsv) #undef TBSV_LAUNCHER_USM template -inline sycl::event tpmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpmv(const char* func_name, Func func, 
sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "tpmv", "for row_major layout"); } #define TPMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ x, incx, dependencies); \ } @@ -2630,16 +2630,16 @@ TPMV_LAUNCHER_USM(std::complex, cublasZtpmv) #undef TPMV_LAUNCHER_USM template -inline sycl::event tpsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "tpsv", "for row_major layout"); } #define TPSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ x, incx, dependencies); \ } @@ -2652,16 +2652,16 @@ TPSV_LAUNCHER_USM(std::complex, cublasZtpsv) #undef TPSV_LAUNCHER_USM template -inline 
sycl::event trmv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event trmv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { throw unimplemented("blas", "trmv", "for row_major layout"); } #define TRMV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return trmv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ lda, x, incx, dependencies); \ } @@ -2674,16 +2674,16 @@ TRMV_LAUNCHER_USM(std::complex, cublasZtrmv) #undef TRMV_LAUNCHER_USM template -inline sycl::event trsv(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event trsv(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { throw unimplemented("blas", "trsv", "for row_major layout"); } #define TRSV_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trsv(sycl::queue& queue, uplo upper_lower, 
transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return trsv(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, \ lda, x, incx, dependencies); \ } diff --git a/src/blas/backends/cublas/cublas_level3.cpp b/src/blas/backends/cublas/cublas_level3.cpp index 5ea4e2152..da94de959 100644 --- a/src/blas/backends/cublas/cublas_level3.cpp +++ b/src/blas/backends/cublas/cublas_level3.cpp @@ -31,33 +31,33 @@ namespace column_major { // Buffer APIs template -inline void gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, int64_t k, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void gemm(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, int64_t k, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, - a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc); + get_cublas_operation(transb), m, n, k, 
(cuDataType*)&alpha, a_, + lda, b_, ldb, (cuDataType*)&beta, c_, ldc); }); }); } #define GEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE beta, sycl::buffer& c, \ int64_t ldc) { \ gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, \ ldb, beta, c, ldc); \ @@ -72,15 +72,15 @@ GEMM_LAUNCHER(std::complex, cublasZgemm) template -inline void gemm_ex(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, sycl::queue &queue, +inline void gemm_ex(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, T_C alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - T_C beta, sycl::buffer &c, int64_t ldc) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + T_C beta, sycl::buffer& c, int64_t ldc) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; using cuDataType_C = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { if (!verify_support(queue, sycl::aspect::fp16)) { throw oneapi::mkl::unimplemented( "blas", "sycl::half", "half is not supported by the device or the sycl compiler"); @@ -88,24 +88,24 @@ inline void gemm_ex(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, sycl::que auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + 
onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_SYNC(cublasGemmEx, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, k, (cuDataType_C *)&alpha, - a_, DT_A, lda, b_, DT_B, ldb, (cuDataType_C *)&beta, c_, DT_C, - ldc, DT_C, CUBLAS_GEMM_DEFAULT); + get_cublas_operation(transb), m, n, k, (cuDataType_C*)&alpha, a_, + DT_A, lda, b_, DT_B, ldb, (cuDataType_C*)&beta, c_, DT_C, ldc, + DT_C, CUBLAS_GEMM_DEFAULT); }); }); } #define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_C alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE_C beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_C alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE_C beta, sycl::buffer& c, \ int64_t ldc) { \ gemm_ex(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb, m, n, k, \ alpha, a, lda, b, ldb, beta, c, ldc); \ @@ -116,40 +116,40 @@ GEMM_EX_LAUNCHER(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, CUD #undef GEMM_EX_LAUNCHER -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { throw 
unimplemented("blas", "gemm", "for column_major layout"); } template -inline void symm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void symm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right), - get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, - a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc); + get_cublas_fill_mode(upper_lower), m, n, (cuDataType*)&alpha, + a_, lda, b_, ldb, (cuDataType*)&beta, c_, ldc); }); }); } #define SYMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, 
sycl::buffer& c, int64_t ldc) { \ symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \ b, ldb, beta, c, ldc); \ } @@ -162,33 +162,33 @@ SYMM_LAUNCHER(std::complex, cublasZsymm) #undef SYMM_LAUNCHER template -inline void hemm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void hemm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right), - get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, - a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc); + get_cublas_fill_mode(upper_lower), m, n, (cuDataType*)&alpha, + a_, lda, b_, ldb, (cuDataType*)&beta, c_, ldc); }); }); } #define HEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + 
void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \ b, ldb, beta, c, ldc); \ } @@ -198,31 +198,31 @@ HEMM_LAUNCHER(std::complex, cublasZhemm) #undef HEMM_LAUNCHER template -inline void syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer &a, int64_t lda, - T beta, sycl::buffer &c, int64_t ldc) { +inline void syrk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer& a, int64_t lda, + T beta, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuDataType *)&alpha, a_, lda, (cuDataType *)&beta, c_, + n, k, (cuDataType*)&alpha, a_, lda, (cuDataType*)&beta, c_, ldc); }); }); } #define SYRK_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, 
int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \ beta, c, ldc); \ } @@ -235,33 +235,33 @@ SYRK_LAUNCHER(std::complex, cublasZsyrk) #undef SYRK_LAUNCHER template -inline void herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline void herk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, ScalarType alpha, - sycl::buffer &a, int64_t lda, ScalarType beta, - sycl::buffer &c, int64_t ldc) { + sycl::buffer& a, int64_t lda, ScalarType beta, + sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; using cuScalarType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuScalarType *)&alpha, a_, lda, (cuScalarType *)&beta, - c_, ldc); + n, k, (cuScalarType*)&alpha, a_, lda, (cuScalarType*)&beta, c_, + ldc); }); }); } #define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \ - void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - SCALAR_TYPE alpha, sycl::buffer &a, int64_t lda, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + SCALAR_TYPE alpha, 
sycl::buffer& a, int64_t lda, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \ beta, c, ldc); \ } @@ -272,34 +272,34 @@ HERK_LAUNCHER(std::complex, double, cublasZherk) #undef HERK_LAUNCHER template -inline void syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void syr2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuDataType *)&alpha, a_, lda, b_, ldb, - (cuDataType *)&beta, c_, ldc); + n, k, (cuDataType*)&alpha, a_, lda, b_, ldb, + (cuDataType*)&beta, c_, ldc); }); }); } #define SYR2K_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t 
ldc) { \ + void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \ ldb, beta, c, ldc); \ } @@ -311,36 +311,36 @@ SYR2K_LAUNCHER(std::complex, cublasZsyr2k) #undef SYR2K_LAUNCHER template -inline void her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline void her2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, DataType alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, ScalarType beta, sycl::buffer &c, int64_t ldc) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, ScalarType beta, sycl::buffer& c, int64_t ldc) { using cuDataType = typename CudaEquivalentType::Type; using cuScalarType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuDataType *)&alpha, a_, lda, b_, ldb, - (cuScalarType *)&beta, c_, ldc); + n, k, (cuDataType*)&alpha, a_, lda, b_, ldb, + (cuScalarType*)&beta, c_, ldc); }); }); } #define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, 
CUBLAS_ROUTINE) \ - void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - DATA_TYPE alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + DATA_TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \ ldb, beta, c, ldc); \ } @@ -355,31 +355,31 @@ HER2K_LAUNCHER(std::complex, double, cublasZher2k) // separated from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. template -inline void trmm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline void trmm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha, - a_, lda, b_, ldb, b_, 
ldb); + get_cublas_diag_type(unit_diag), m, n, (cuDataType*)&alpha, a_, + lda, b_, ldb, b_, ldb); }); }); } #define TRMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, sycl::buffer& b, int64_t ldb) { \ trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \ n, alpha, a, lda, b, ldb); \ } @@ -391,31 +391,31 @@ TRMM_LAUNCHER(std::complex, cublasZtrmm) #undef TRMM_LAUNCHER template -inline void trsm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline void trsm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha, - a_, lda, b_, ldb); + 
get_cublas_diag_type(unit_diag), m, n, (cuDataType*)&alpha, a_, + lda, b_, ldb); }); }); } #define TRSM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, sycl::buffer& b, int64_t ldb) { \ trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \ n, alpha, a, lda, b, ldb); \ } @@ -429,36 +429,36 @@ TRSM_LAUNCHER(std::complex, cublasZtrsm) // USM APIs template -inline sycl::event gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, int64_t k, T alpha, const T *a, - int64_t lda, const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event gemm(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, int64_t k, T alpha, const T* a, + int64_t lda, const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = 
reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, k, (cuDataType *)&alpha, - a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc); + get_cublas_operation(transb), m, n, k, (cuDataType*)&alpha, a_, + lda, b_, ldb, (cuDataType*)&beta, c_, ldc); }); }); return done; } #define GEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, \ - int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, \ + int64_t ldb, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, \ lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -473,29 +473,29 @@ GEMM_LAUNCHER_USM(std::complex, cublasZgemm) template inline sycl::event gemm_ex_usm(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, - sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, T_C alpha, const T_A *a, int64_t lda, - const T_B *b, int64_t ldb, T_C beta, T_C *c, int64_t ldc, - const std::vector &dependencies) { + sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, T_C alpha, const T_A* a, int64_t lda, + const T_B* b, int64_t ldb, T_C beta, T_C* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; using cuDataType_C = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& 
cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_SYNC(cublasGemmEx, err, handle, get_cublas_operation(transa), - get_cublas_operation(transb), m, n, k, (cuDataType_C *)&alpha, - a_, DT_A, lda, b_, DT_B, ldb, (cuDataType_C *)&beta, c_, DT_C, - ldc, DT_C, CUBLAS_GEMM_DEFAULT); + get_cublas_operation(transb), m, n, k, (cuDataType_C*)&alpha, a_, + DT_A, lda, b_, DT_B, ldb, (cuDataType_C*)&beta, c_, DT_C, ldc, + DT_C, CUBLAS_GEMM_DEFAULT); }); }); return done; @@ -503,10 +503,10 @@ inline sycl::event gemm_ex_usm(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C #define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B, \ CUDADATATYPE_C) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_C alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b, \ - int64_t ldb, TYPE_C beta, TYPE_C *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_C alpha, const TYPE_A* a, int64_t lda, const TYPE_B* b, \ + int64_t ldb, TYPE_C beta, TYPE_C* c, int64_t ldc, \ + const std::vector& dependencies) { \ return gemm_ex_usm(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb, \ m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -516,44 +516,44 @@ GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, #undef GEMM_EX_LAUNCHER_USM 
-sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const bfloat16* a, int64_t lda, const bfloat16* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", "for column_major layout"); } template -inline sycl::event symm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event symm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right), - get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, - 
a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc); + get_cublas_fill_mode(upper_lower), m, n, (cuDataType*)&alpha, + a_, lda, b_, ldb, (cuDataType*)&beta, c_, ldc); }); }); return done; } #define SYMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \ a, lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -566,36 +566,36 @@ SYMM_LAUNCHER_USM(std::complex, cublasZsymm) #undef SYMM_LAUNCHER_USM template -inline sycl::event hemm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event hemm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right), - get_cublas_fill_mode(upper_lower), m, n, (cuDataType *)&alpha, - a_, lda, b_, ldb, (cuDataType *)&beta, c_, ldc); + get_cublas_fill_mode(upper_lower), m, n, (cuDataType*)&alpha, + a_, lda, b_, ldb, (cuDataType*)&beta, c_, ldc); }); }); return done; } #define HEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \ a, lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -605,24 +605,24 @@ HEMM_LAUNCHER_USM(std::complex, cublasZhemm) #undef HEMM_LAUNCHER_USM template -inline sycl::event syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, int64_t n, int64_t k, T alpha, const T *a, int64_t lda, - T beta, T *c, int64_t ldc, const std::vector &dependencies) { +inline sycl::event syrk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, const T* a, int64_t lda, + T beta, T* c, int64_t ldc, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = 
queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuDataType *)&alpha, a_, lda, (cuDataType *)&beta, c_, + n, k, (cuDataType*)&alpha, a_, lda, (cuDataType*)&beta, c_, ldc); }); }); @@ -630,9 +630,9 @@ inline sycl::event syrk(const char *func_name, Func func, sycl::queue &queue, up } #define SYRK_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, beta, c, ldc, dependencies); \ } @@ -645,37 +645,37 @@ SYRK_LAUNCHER_USM(std::complex, cublasZsyrk) #undef SYRK_LAUNCHER_USM template -inline sycl::event herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline sycl::event herk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, const ScalarType alpha, - const DataType *a, int64_t lda, const ScalarType beta, DataType *c, - int64_t ldc, const std::vector &dependencies) { + const DataType* a, int64_t lda, const ScalarType 
beta, DataType* c, + int64_t ldc, const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; using cuScalarType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuScalarType *)&alpha, a_, lda, (cuScalarType *)&beta, - c_, ldc); + n, k, (cuScalarType*)&alpha, a_, lda, (cuScalarType*)&beta, c_, + ldc); }); }); return done; } #define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \ - sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda, \ - const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const SCALAR_TYPE alpha, const DATA_TYPE* a, int64_t lda, \ + const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, beta, c, ldc, dependencies); \ } @@ -686,37 +686,37 @@ HERK_LAUNCHER_USM(std::complex, double, cublasZherk) #undef HERK_LAUNCHER_USM template -inline sycl::event syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, 
int64_t n, int64_t k, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event syr2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuDataType *)&alpha, a_, lda, b_, ldb, - (cuDataType *)&beta, c_, ldc); + n, k, (cuDataType*)&alpha, a_, lda, b_, ldb, + (cuDataType*)&beta, c_, ldc); }); }); return done; } #define SYR2K_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return 
syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -728,39 +728,39 @@ SYR2K_LAUNCHER_USM(std::complex, cublasZsyr2k) #undef SYR2K_LAUNCHER_USM template -inline sycl::event her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline sycl::event her2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, const DataType alpha, - const DataType *a, int64_t lda, const DataType *b, int64_t ldb, - const ScalarType beta, DataType *c, int64_t ldc, - const std::vector &dependencies) { + const DataType* a, int64_t lda, const DataType* b, int64_t ldb, + const ScalarType beta, DataType* c, int64_t ldc, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; using cuScalarType = typename CudaEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - n, k, (cuDataType *)&alpha, a_, lda, b_, ldb, - (cuScalarType *)&beta, c_, ldc); + n, k, (cuDataType*)&alpha, a_, lda, b_, ldb, + (cuScalarType*)&beta, c_, ldc); }); }); return done; } #define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \ - sycl::event her2k(sycl::queue &queue, uplo 
upper_lower, transpose trans, int64_t n, int64_t k, \ - const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b, \ - int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const DATA_TYPE alpha, const DATA_TYPE* a, int64_t lda, const DATA_TYPE* b, \ + int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -775,35 +775,35 @@ HER2K_LAUNCHER_USM(std::complex, double, cublasZher2k) // separated from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. template -inline sycl::event trmm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline sycl::event trmm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { + T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, 
err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha, - a_, lda, b_, ldb, b_, ldb); + get_cublas_diag_type(unit_diag), m, n, (cuDataType*)&alpha, a_, + lda, b_, ldb, b_, ldb); }); }); return done; } #define TRMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); \ } @@ -815,35 +815,35 @@ TRMM_LAUNCHER_USM(std::complex, cublasZtrmm) #undef TRMM_LAUNCHER_USM template -inline sycl::event trsm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline sycl::event trsm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { + T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldb); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler &sc) { + onemkl_cublas_host_task(cgh, queue, [=](CublasScopedContextHandler& sc) { 
auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); cublasStatus_t err; CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(left_right), get_cublas_fill_mode(upper_lower), get_cublas_operation(trans), - get_cublas_diag_type(unit_diag), m, n, (cuDataType *)&alpha, - a_, lda, b_, ldb); + get_cublas_diag_type(unit_diag), m, n, (cuDataType*)&alpha, a_, + lda, b_, ldb); }); }); return done; } #define TRSM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); \ } @@ -860,17 +860,17 @@ namespace row_major { // Buffer APIs template -inline void gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, int64_t k, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void gemm(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, int64_t k, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemm", "for row_major layout"); } #define GEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, sycl::buffer 
&a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE beta, sycl::buffer& c, \ int64_t ldc) { \ gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, \ ldb, beta, c, ldc); \ @@ -885,17 +885,17 @@ GEMM_LAUNCHER(std::complex, cublasZgemm) template -inline void gemm_ex(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, sycl::queue &queue, +inline void gemm_ex(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, T_C alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - T_C beta, sycl::buffer &c, int64_t ldc) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + T_C beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemm", "for row_major layout"); } #define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_C alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE_C beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_C alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE_C beta, sycl::buffer& c, \ int64_t ldc) { \ gemm_ex(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb, m, n, k, \ alpha, a, lda, b, ldb, beta, c, ldc); \ @@ -906,24 +906,24 @@ GEMM_EX_LAUNCHER(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, CUD #undef GEMM_EX_LAUNCHER -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - 
int64_t ldb, float beta, sycl::buffer &c, int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemm", "for row_major layout"); } template -inline void symm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void symm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "symm", "for row_major layout"); } #define SYMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \ b, ldb, beta, c, ldc); \ } @@ -936,17 +936,17 @@ SYMM_LAUNCHER(std::complex, cublasZsymm) #undef SYMM_LAUNCHER template -inline void hemm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void hemm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T 
beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "hemm", "for row_major layout"); } #define HEMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, \ b, ldb, beta, c, ldc); \ } @@ -956,16 +956,16 @@ HEMM_LAUNCHER(std::complex, cublasZhemm) #undef HEMM_LAUNCHER template -inline void syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer &a, int64_t lda, - T beta, sycl::buffer &c, int64_t ldc) { +inline void syrk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer& a, int64_t lda, + T beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "syrk", "for row_major layout"); } #define SYRK_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \ beta, c, ldc); \ } @@ -978,17 +978,17 @@ SYRK_LAUNCHER(std::complex, cublasZsyrk) #undef SYRK_LAUNCHER template -inline void herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline void 
herk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, ScalarType alpha, - sycl::buffer &a, int64_t lda, ScalarType beta, - sycl::buffer &c, int64_t ldc) { + sycl::buffer& a, int64_t lda, ScalarType beta, + sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "herk", "for row_major layout"); } #define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \ - void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - SCALAR_TYPE alpha, sycl::buffer &a, int64_t lda, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + SCALAR_TYPE alpha, sycl::buffer& a, int64_t lda, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \ beta, c, ldc); \ } @@ -999,17 +999,17 @@ HERK_LAUNCHER(std::complex, double, cublasZherk) #undef HERK_LAUNCHER template -inline void syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, +inline void syr2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "syr2k", "for row_major layout"); } #define SYR2K_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, 
TYPE beta, sycl::buffer& c, int64_t ldc) { \ syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \ ldb, beta, c, ldc); \ } @@ -1021,18 +1021,18 @@ SYR2K_LAUNCHER(std::complex, cublasZsyr2k) #undef SYR2K_LAUNCHER template -inline void her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline void her2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, DataType alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, ScalarType beta, sycl::buffer &c, int64_t ldc) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, ScalarType beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "her2k", "for row_major layout"); } #define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \ - void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - DATA_TYPE alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + DATA_TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, \ ldb, beta, c, ldc); \ } @@ -1047,16 +1047,16 @@ HER2K_LAUNCHER(std::complex, double, cublasZher2k) // separated from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. 
template -inline void trmm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline void trmm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { throw unimplemented("blas", "trmm", "for row_major layout"); } #define TRMM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, sycl::buffer& b, int64_t ldb) { \ trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \ n, alpha, a, lda, b, ldb); \ } @@ -1068,16 +1068,16 @@ TRMM_LAUNCHER(std::complex, cublasZtrmm) #undef TRMM_LAUNCHER template -inline void trsm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline void trsm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { throw unimplemented("blas", "trsm", "for row_major layout"); } #define TRSM_LAUNCHER(TYPE, CUBLAS_ROUTINE) \ - void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + 
int64_t lda, sycl::buffer& b, int64_t ldb) { \ trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, \ n, alpha, a, lda, b, ldb); \ } @@ -1091,18 +1091,18 @@ TRSM_LAUNCHER(std::complex, cublasZtrsm) // USM APIs template -inline sycl::event gemm(const char *func_name, Func func, sycl::queue &queue, transpose transa, - transpose transb, int64_t m, int64_t n, int64_t k, T alpha, const T *a, - int64_t lda, const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event gemm(const char* func_name, Func func, sycl::queue& queue, transpose transa, + transpose transb, int64_t m, int64_t n, int64_t k, T alpha, const T* a, + int64_t lda, const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", "for row_major layout"); } #define GEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, \ - int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, \ + int64_t ldb, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return gemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, \ lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -1117,19 +1117,19 @@ GEMM_LAUNCHER_USM(std::complex, cublasZgemm) template inline sycl::event gemm_ex_usm(DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, - sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, T_C alpha, const T_A *a, int64_t lda, - const T_B *b, int64_t ldb, T_C beta, T_C *c, int64_t ldc, - const std::vector &dependencies) { + sycl::queue& queue, transpose transa, 
transpose transb, int64_t m, + int64_t n, int64_t k, T_C alpha, const T_A* a, int64_t lda, + const T_B* b, int64_t ldb, T_C beta, T_C* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", "for row_major layout"); } #define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, CUDADATATYPE_A, CUDADATATYPE_B, \ CUDADATATYPE_C) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_C alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b, \ - int64_t ldb, TYPE_C beta, TYPE_C *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_C alpha, const TYPE_A* a, int64_t lda, const TYPE_B* b, \ + int64_t ldb, TYPE_C beta, TYPE_C* c, int64_t ldc, \ + const std::vector& dependencies) { \ return gemm_ex_usm(CUDADATATYPE_A, CUDADATATYPE_B, CUDADATATYPE_C, queue, transa, transb, \ m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -1139,26 +1139,26 @@ GEMM_EX_LAUNCHER_USM(sycl::half, sycl::half, sycl::half, CUDA_R_16F, CUDA_R_16F, #undef GEMM_EX_LAUNCHER_USM -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const bfloat16* a, int64_t lda, const bfloat16* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", "for row_major layout"); } template -inline sycl::event symm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, 
- const std::vector &dependencies) { +inline sycl::event symm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "symm", "for row_major layout"); } #define SYMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return symm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \ a, lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -1171,18 +1171,18 @@ SYMM_LAUNCHER_USM(std::complex, cublasZsymm) #undef SYMM_LAUNCHER_USM template -inline sycl::event hemm(const char *func_name, Func func, sycl::queue &queue, side left_right, - uplo upper_lower, int64_t m, int64_t n, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event hemm(const char* func_name, Func func, sycl::queue& queue, side left_right, + uplo upper_lower, int64_t m, int64_t n, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "hemm", "for row_major layout"); } #define HEMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - 
const std::vector &dependencies) { \ + sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return hemm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, \ a, lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -1192,16 +1192,16 @@ HEMM_LAUNCHER_USM(std::complex, cublasZhemm) #undef HEMM_LAUNCHER_USM template -inline sycl::event syrk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, int64_t n, int64_t k, T alpha, const T *a, int64_t lda, - T beta, T *c, int64_t ldc, const std::vector &dependencies) { +inline sycl::event syrk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, const T* a, int64_t lda, + T beta, T* c, int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "syrk", "for row_major layout"); } #define SYRK_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return syrk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, beta, c, ldc, dependencies); \ } @@ -1214,18 +1214,18 @@ SYRK_LAUNCHER_USM(std::complex, cublasZsyrk) #undef SYRK_LAUNCHER_USM template -inline sycl::event herk(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline sycl::event herk(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, 
int64_t k, const ScalarType alpha, - const DataType *a, int64_t lda, const ScalarType beta, DataType *c, - int64_t ldc, const std::vector &dependencies) { + const DataType* a, int64_t lda, const ScalarType beta, DataType* c, + int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "herk", "for row_major layout"); } #define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \ - sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda, \ - const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const SCALAR_TYPE alpha, const DATA_TYPE* a, int64_t lda, \ + const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return herk(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, beta, c, ldc, dependencies); \ } @@ -1236,18 +1236,18 @@ HERK_LAUNCHER_USM(std::complex, double, cublasZherk) #undef HERK_LAUNCHER_USM template -inline sycl::event syr2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, - transpose trans, int64_t n, int64_t k, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event syr2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, + transpose trans, int64_t n, int64_t k, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "syr2k", "for row_major layout"); } #define SYR2K_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t 
ldc, \ - const std::vector &dependencies) { \ + sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return syr2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -1259,19 +1259,19 @@ SYR2K_LAUNCHER_USM(std::complex, cublasZsyr2k) #undef SYR2K_LAUNCHER_USM template -inline sycl::event her2k(const char *func_name, Func func, sycl::queue &queue, uplo upper_lower, +inline sycl::event her2k(const char* func_name, Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, const DataType alpha, - const DataType *a, int64_t lda, const DataType *b, int64_t ldb, - const ScalarType beta, DataType *c, int64_t ldc, - const std::vector &dependencies) { + const DataType* a, int64_t lda, const DataType* b, int64_t ldb, + const ScalarType beta, DataType* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "her2k", "for row_major layout"); } #define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, CUBLAS_ROUTINE) \ - sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b, \ - int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const DATA_TYPE alpha, const DATA_TYPE* a, int64_t lda, const DATA_TYPE* b, \ + int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return her2k(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, \ lda, b, ldb, beta, c, ldc, dependencies); \ } @@ -1286,17 +1286,17 @@ HER2K_LAUNCHER_USM(std::complex, 
double, cublasZher2k) // separated from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. template -inline sycl::event trmm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline sycl::event trmm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { + T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "trmm", "for row_major layout"); } #define TRMM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trmm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); \ } @@ -1308,17 +1308,17 @@ TRMM_LAUNCHER_USM(std::complex, cublasZtrmm) #undef TRMM_LAUNCHER_USM template -inline sycl::event trsm(const char *func_name, Func func, sycl::queue &queue, side left_right, +inline sycl::event trsm(const char* func_name, Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { + T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "trsm", "for row_major layout"); } #define TRSM_LAUNCHER_USM(TYPE, CUBLAS_ROUTINE) \ - sycl::event 
trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trsm(#CUBLAS_ROUTINE, CUBLAS_ROUTINE, queue, left_right, upper_lower, trans, \ unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); \ } diff --git a/src/blas/backends/cublas/cublas_scope_handle.cpp b/src/blas/backends/cublas/cublas_scope_handle.cpp index 1f84de776..758b2e14d 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.cpp +++ b/src/blas/backends/cublas/cublas_scope_handle.cpp @@ -43,7 +43,7 @@ thread_local cublas_handle CublasScopedContextHandler::handle_helper cublas_handle{}; #endif -CublasScopedContextHandler::CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih) +CublasScopedContextHandler::CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih) : ih(ih), needToRecover_(false) { placedContext_ = new sycl::context(queue.get_context()); @@ -73,8 +73,8 @@ CublasScopedContextHandler::~CublasScopedContextHandler() noexcept(false) { delete placedContext_; } -void ContextCallback(void *userData) { - auto *ptr = static_cast *>(userData); +void ContextCallback(void* userData) { + auto* ptr = static_cast*>(userData); if (!ptr) { return; } @@ -92,7 +92,7 @@ void ContextCallback(void *userData) { } } -cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue &queue) { +cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue& queue) { auto cudaDevice = ih.get_native_device(); CUresult cuErr; CUcontext desired; @@ -139,10 +139,10 @@ cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue &queue) return handle; } -CUstream 
CublasScopedContextHandler::get_stream(const sycl::queue &queue) { +CUstream CublasScopedContextHandler::get_stream(const sycl::queue& queue) { return sycl::get_native(queue); } -sycl::context CublasScopedContextHandler::get_context(const sycl::queue &queue) { +sycl::context CublasScopedContextHandler::get_context(const sycl::queue& queue) { return queue.get_context(); } diff --git a/src/blas/backends/cublas/cublas_scope_handle.hpp b/src/blas/backends/cublas/cublas_scope_handle.hpp index 341316d2f..4f5a58ee3 100644 --- a/src/blas/backends/cublas/cublas_scope_handle.hpp +++ b/src/blas/backends/cublas/cublas_scope_handle.hpp @@ -85,19 +85,19 @@ the handle must be destroyed when the context goes out of scope. This will bind class CublasScopedContextHandler { CUcontext original_; - sycl::context *placedContext_; + sycl::context* placedContext_; bool needToRecover_; - sycl::interop_handle &ih; + sycl::interop_handle& ih; #ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED static thread_local cublas_handle handle_helper; #else static thread_local cublas_handle handle_helper; #endif - CUstream get_stream(const sycl::queue &queue); - sycl::context get_context(const sycl::queue &queue); + CUstream get_stream(const sycl::queue& queue); + sycl::context get_context(const sycl::queue& queue); public: - CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih); + CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih); ~CublasScopedContextHandler() noexcept(false); /** @@ -107,7 +107,7 @@ class CublasScopedContextHandler { * @param queue sycl queue. * @return cublasHandle_t a handle to construct cublas routines */ - cublasHandle_t get_handle(const sycl::queue &queue); + cublasHandle_t get_handle(const sycl::queue& queue); // This is a work-around function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. 
template @@ -116,7 +116,7 @@ class CublasScopedContextHandler { return reinterpret_cast(cudaPtr); } - void wait_stream(const sycl::queue &queue) { + void wait_stream(const sycl::queue& queue) { cuStreamSynchronize(get_stream(queue)); } }; diff --git a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp b/src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp index 20675c212..03c282aed 100644 --- a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp +++ b/src/blas/backends/cublas/cublas_scope_handle_hipsycl.cpp @@ -26,10 +26,10 @@ namespace cublas { thread_local cublas_handle CublasScopedContextHandler::handle_helper = cublas_handle{}; -CublasScopedContextHandler::CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih) +CublasScopedContextHandler::CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih) : interop_h(ih) {} -cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue &queue) { +cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue& queue) { sycl::device device = queue.get_device(); int current_device = interop_h.get_native_device(); CUstream streamId = get_stream(queue); @@ -64,7 +64,7 @@ cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue &queue) return handle; } -CUstream CublasScopedContextHandler::get_stream(const sycl::queue &queue) { +CUstream CublasScopedContextHandler::get_stream(const sycl::queue& queue) { return interop_h.get_native_queue(); } diff --git a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp b/src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp index c7ec3e520..9e1eb89e5 100644 --- a/src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp +++ b/src/blas/backends/cublas/cublas_scope_handle_hipsycl.hpp @@ -61,13 +61,13 @@ the handle must be destroyed when the context goes out of scope. 
This will bind class CublasScopedContextHandler { sycl::interop_handle interop_h; static thread_local cublas_handle handle_helper; - sycl::context get_context(const sycl::queue &queue); - CUstream get_stream(const sycl::queue &queue); + sycl::context get_context(const sycl::queue& queue); + CUstream get_stream(const sycl::queue& queue); public: - CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih); + CublasScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih); - cublasHandle_t get_handle(const sycl::queue &queue); + cublasHandle_t get_handle(const sycl::queue& queue); // This is a work-around function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. diff --git a/src/blas/backends/cublas/cublas_task.hpp b/src/blas/backends/cublas/cublas_task.hpp index a486aafee..fa522bfc3 100644 --- a/src/blas/backends/cublas/cublas_task.hpp +++ b/src/blas/backends/cublas/cublas_task.hpp @@ -58,7 +58,7 @@ namespace cublas { #ifdef __HIPSYCL__ template -static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { +static inline void host_task_internal(H& cgh, sycl::queue queue, F f) { cgh.hipSYCL_enqueue_custom_operation([f, queue](sycl::interop_handle ih) { auto sc = CublasScopedContextHandler(queue, ih); f(sc); @@ -66,7 +66,7 @@ static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { } #else template -static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { +static inline void host_task_internal(H& cgh, sycl::queue queue, F f) { cgh.host_task([f, queue](sycl::interop_handle ih) { auto sc = CublasScopedContextHandler(queue, ih); f(sc); @@ -74,7 +74,7 @@ static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { } #endif template -static inline void onemkl_cublas_host_task(H &cgh, sycl::queue queue, F f) { +static inline void onemkl_cublas_host_task(H& cgh, sycl::queue queue, F f) { (void)host_task_internal(cgh, queue, f); } diff --git 
a/src/blas/backends/mkl_common/mkl_batch.cxx b/src/blas/backends/mkl_common/mkl_batch.cxx index 6358a3922..4bd9076b8 100644 --- a/src/blas/backends/mkl_common/mkl_batch.cxx +++ b/src/blas/backends/mkl_common/mkl_batch.cxx @@ -19,347 +19,347 @@ // Buffer APIs -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - std::int64_t stridex, sycl::buffer &y, int64_t incy, std::int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + std::int64_t stridex, sycl::buffer& y, int64_t incy, std::int64_t stridey, std::int64_t batch_size) { blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - std::int64_t stridex, sycl::buffer &y, int64_t incy, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + std::int64_t stridex, sycl::buffer& y, int64_t incy, std::int64_t stridey, std::int64_t batch_size) { blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, int64_t incy, std::int64_t stridey, std::int64_t batch_size) { blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, int64_t incy, std::int64_t stridey, std::int64_t batch_size) { blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, int64_t 
stridex, sycl::buffer &y, int64_t incy, +void axpy_batch(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, float beta, sycl::buffer &y, int64_t incy, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + 
sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, float beta, sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, double beta, - sycl::buffer &y, int64_t incy, int64_t stride_y, int64_t batch_size) { +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, double beta, + sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, sycl::buffer, 1> &y, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, + int64_t stride_x, std::complex beta, sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, int64_t batch_size) { blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size); } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + 
int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stride_y, + sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, int64_t batch_size) { blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, sycl::buffer &x, - int64_t incx, int64_t stride_x, sycl::buffer &c, int64_t ldc, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, 
int64_t stride_c, int64_t batch_size) { blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size); } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, double beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + 
sycl::buffer& b, int64_t ldb, int64_t stride_b, double beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, - int64_t stride_b, std::complex beta, sycl::buffer, 1> &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, + int64_t stride_b, std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, sycl::half alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::half beta, sycl::buffer &c, 
int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { throw unimplemented("blas", "gemm_batch", "unsupported dtype combination: int8_t, int8_t, float, float"); } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t 
k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side 
left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, 
int64_t k, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, sycl::buffer, 1> &c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } 
-void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } 
-void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + float beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(sycl::queue &queue, 
transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { 
blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); @@ -367,641 +367,641 @@ void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64 // USM APIs -sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t incx, - std::int64_t stridex, float *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const float* x, int64_t incx, + std::int64_t stridex, float* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx, - std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const double* x, int64_t incx, + std::int64_t stridex, double* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t incy, +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::int64_t stridex, std::complex *y, int64_t 
incy, +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, float **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const float** x, int64_t* incx, float** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, double **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const double** x, int64_t* incx, double** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, int64_t *incx, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, 
int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, + int64_t* incx, std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { return blas_major::copy_batch(queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - int64_t stridex, float *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + int64_t stridex, float* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { return blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { return blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { return 
blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { return blas_major::axpy_batch(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, int64_t *incx, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, float* alpha, const float** x, int64_t* incx, + float** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, double* alpha, const double** x, + int64_t* incx, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event 
axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::axpy_batch(queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float beta, float *y, int64_t incy, int64_t stride_y, - int64_t batch_size, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float beta, float* y, int64_t incy, int64_t stride_y, + int64_t batch_size, const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size, dependencies); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, const double *x, - int64_t incx, int64_t stride_x, double beta, double *y, int64_t incy, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + 
const double* a, int64_t lda, int64_t stride_a, const double* x, + int64_t incx, int64_t stride_x, double beta, double* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size, dependencies); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size, dependencies); } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, stride_a, x, incx, stride_x, beta, y, incy, stride_y, batch_size, dependencies); } -sycl::event 
gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, const float **x, int64_t *incx, float *beta, - float **y, int64_t *incy, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, float* alpha, + const float** a, int64_t* lda, const float** x, int64_t* incx, float* beta, + float** y, int64_t* incy, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, groupsize, dependencies); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, const double **x, int64_t *incx, - double *beta, double **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, double* alpha, + const double** a, int64_t* lda, const double** x, int64_t* incx, + double* beta, double** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, groupsize, dependencies); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* 
groupsize, const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, groupsize, dependencies); } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { return blas_major::gemv_batch(queue, transa, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, groupsize, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const float *a, - int64_t lda, int64_t stride_a, const float *x, int64_t incx, - int64_t stride_x, float *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const float* a, + int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, const double *a, - int64_t lda, int64_t stride_a, const double *x, int64_t incx, - int64_t stride_x, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, 
int64_t m, int64_t n, const double* a, + int64_t lda, int64_t stride_a, const double* x, int64_t incx, + int64_t stride_x, double* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, stride_a, x, incx, stride_x, c, ldc, stride_c, batch_size, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const float **a, int64_t *lda, const float **x, int64_t 
*incx, float **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const float** a, int64_t* lda, const float** x, int64_t* incx, float** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, groupsize, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const double **a, int64_t *lda, const double **x, int64_t *incx, double **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const double** a, int64_t* lda, const double** x, int64_t* incx, double** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, groupsize, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, groupsize, dependencies); } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, const std::complex **x, - int64_t *incx, std::complex **c, int64_t 
*ldc, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { return blas_major::dgmm_batch(queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, groupsize, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const float *a, int64_t lda, int64_t stride_a, - const float *b, int64_t ldb, int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const float* a, int64_t lda, int64_t stride_a, + const float* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, const double *a, int64_t lda, int64_t stride_a, - const double *b, int64_t ldb, int64_t stride_b, double beta, double *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, const double* a, int64_t lda, int64_t stride_a, + const double* b, int64_t ldb, int64_t stride_b, double beta, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, 
ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, const std::complex *b, int64_t ldb, - int64_t stride_b, std::complex beta, std::complex *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, const std::complex *b, int64_t ldb, - int64_t stride_b, std::complex beta, std::complex *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda, - int64_t stride_a, const sycl::half *b, int64_t ldb, int64_t stride_b, - sycl::half beta, 
sycl::half *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, const sycl::half* a, int64_t lda, + int64_t stride_a, const sycl::half* b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::half* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const sycl::half *a, int64_t lda, int64_t stride_a, - const sycl::half *b, int64_t ldb, int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const sycl::half* a, int64_t lda, int64_t stride_a, + const sycl::half* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a, - const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const 
std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", "unsupported dtype combination: int8_t, int8_t, float, float"); } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a, - const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, - std::int32_t *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, + std::int32_t* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda, - const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const float** a, int64_t* lda, + const float** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda, - const double 
**b, int64_t *ldb, double *beta, double **c, int64_t *ldc, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, double* alpha, const double** a, int64_t* lda, + const double** b, int64_t* ldb, double* beta, double** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, const std::complex **b, - int64_t *ldb, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, const std::complex **b, - int64_t *ldb, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const 
std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, sycl::half *alpha, const sycl::half **a, - int64_t *lda, const sycl::half **b, int64_t *ldb, sycl::half *beta, - sycl::half **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, sycl::half* alpha, const sycl::half** a, + int64_t* lda, const sycl::half** b, int64_t* ldb, sycl::half* beta, + sycl::half** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, groupsize, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const sycl::half **a, int64_t *lda, - const sycl::half **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const sycl::half** a, int64_t* lda, + const sycl::half** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, groupsize, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose 
*transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda, - const std::int8_t **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", "unsupported dtype combination: int8_t, int8_t, float, float"); } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda, - const std::int8_t **b, int64_t *ldb, float *beta, std::int32_t **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, std::int32_t** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::gemm_batch(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, groupsize, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, int64_t stride_a, float *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, int64_t stride_a, 
float* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, - int64_t lda, int64_t stride_a, double *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - 
const std::complex *a, int64_t lda, int64_t stride_a, - std::complex *b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, float *alpha, const float **a, - int64_t *lda, float **b, int64_t *ldb, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, float* alpha, const float** a, + int64_t* lda, float** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, double *alpha, const double **a, - int64_t *lda, double **b, int64_t *ldb, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, double* alpha, const double** a, + int64_t* lda, double** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, 
side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, std::complex *alpha, - const std::complex **a, int64_t *lda, std::complex **b, - int64_t *ldb, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, transpose *trans, - diag *unit_diag, int64_t *m, int64_t *n, std::complex *alpha, - const std::complex **a, int64_t *lda, std::complex **b, - int64_t *ldb, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::trsm_batch(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, int64_t stride_a, float beta, - float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, int64_t stride_a, float beta, + float* c, int64_t ldc, int64_t 
stride_c, int64_t batch_size, + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, int64_t stride_a, double beta, - double *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, int64_t stride_a, double beta, + double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex beta, std::complex *c, +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, 
std::complex beta, std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, float* alpha, const float** a, int64_t* lda, float* beta, + float** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, groupsize, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, double* alpha, const double** a, int64_t* lda, double* beta, + double** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, groupsize, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, 
uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, groupsize, dependencies); } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { return blas_major::syrk_batch(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, groupsize, dependencies); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, 
int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { return blas_major::omatcopy_batch(queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t 
batch_size, + const std::vector& dependencies) { return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::imatcopy_batch(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, +sycl::event 
omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stride_a, + float beta, const float* b, int64_t ldb, int64_t stride_b, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stride_a, + double beta, const double* b, int64_t ldb, int64_t stride_b, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { return blas_major::omatadd_batch(queue, 
transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { return blas_major::omatadd_batch(queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } diff --git a/src/blas/backends/mkl_common/mkl_blas_backend.hxx b/src/blas/backends/mkl_common/mkl_blas_backend.hxx index 10e441bd7..ca0c036f1 100644 --- a/src/blas/backends/mkl_common/mkl_blas_backend.hxx +++ b/src/blas/backends/mkl_common/mkl_blas_backend.hxx @@ -19,1351 +19,1365 @@ /// level3, buffer -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, sycl::buffer &c, +void gemm(sycl::queue& 
queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::half alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::half beta, - sycl::buffer &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::half alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::half beta, + sycl::buffer& c, std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, 
sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, + std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, 
sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, 
sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &c, std::int64_t ldc); - -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &c, std::int64_t ldc); - -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc); - -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc); - -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer, 1> &a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc); - -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc); - -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - 
std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); - -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); - -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); + +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void symm(sycl::queue& queue, side 
left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); + +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer, 1>& a, std::int64_t lda, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer, 1>& a, std::int64_t lda, double beta, + sycl::buffer, 1>& c, 
std::int64_t ldc); + +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); + +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); + +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc); + +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb); + +void 
trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb); +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& 
b, std::int64_t ldb); -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); // level 3, USM -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, value_or_pointer beta, - float *c, std::int64_t ldc, const std::vector &dependencies = {}); +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, value_or_pointer alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, value_or_pointer beta, + float* c, std::int64_t ldc, const std::vector& dependencies = {}); -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, value_or_pointer beta, - double *c, std::int64_t ldc, const std::vector &dependencies = {}); +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, value_or_pointer alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, value_or_pointer beta, + double* c, std::int64_t ldc, const std::vector& dependencies = {}); -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex 
*b, - std::int64_t ldb, value_or_pointer> beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, value_or_pointer> beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, + const std::complex* a, std::int64_t lda, const std::complex* b, std::int64_t ldb, value_or_pointer> beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - value_or_pointer beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, value_or_pointer beta, - float *c, std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + value_or_pointer beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, 
std::int64_t k, value_or_pointer alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, + value_or_pointer beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, value_or_pointer alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, + value_or_pointer beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, value_or_pointer alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, + value_or_pointer beta, bfloat16* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const bfloat16 *a, std::int64_t lda, const bfloat16 *b, - std::int64_t ldb, value_or_pointer beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const std::int8_t* a, std::int64_t lda, const std::int8_t* b, std::int64_t ldb, + value_or_pointer beta, std::int32_t* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const bfloat16 *a, std::int64_t lda, const bfloat16 *b, - std::int64_t ldb, value_or_pointer beta, bfloat16 *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const std::int8_t *a, - std::int64_t lda, const std::int8_t *b, std::int64_t ldb, 
- value_or_pointer beta, std::int32_t *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const std::int8_t *a, - std::int64_t lda, const std::int8_t *b, std::int64_t ldb, - value_or_pointer beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, value_or_pointer alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, value_or_pointer beta, float *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, value_or_pointer alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, value_or_pointer beta, double *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, + const std::int8_t* a, std::int64_t lda, const std::int8_t* b, std::int64_t ldb, + value_or_pointer beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, value_or_pointer alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, value_or_pointer beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, value_or_pointer alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, value_or_pointer beta, double* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, 
value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, value_or_pointer> beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, value_or_pointer> beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, + const std::complex* a, std::int64_t lda, const std::complex* b, std::int64_t ldb, value_or_pointer> beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, value_or_pointer> beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, value_or_pointer> beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, + const std::complex* a, std::int64_t lda, const std::complex* b, std::int64_t ldb, value_or_pointer> beta, - std::complex *c, std::int64_t ldc, - const std::vector 
&dependencies = {}); + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const float *a, std::int64_t lda, - value_or_pointer beta, float *c, std::int64_t ldc, - const std::vector &dependencies = {}); +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const float* a, std::int64_t lda, + value_or_pointer beta, float* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const double *a, std::int64_t lda, - value_or_pointer beta, double *c, std::int64_t ldc, - const std::vector &dependencies = {}); +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const double* a, std::int64_t lda, + value_or_pointer beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, - value_or_pointer> beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, + value_or_pointer> beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, - value_or_pointer> beta, std::complex *c, - std::int64_t ldc, const std::vector 
&dependencies = {}); - -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const std::complex *a, - std::int64_t lda, value_or_pointer beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const std::complex *a, - std::int64_t lda, value_or_pointer beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, value_or_pointer beta, float *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, value_or_pointer beta, double *c, - std::int64_t ldc, const std::vector &dependencies = {}); - -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + const std::complex* a, std::int64_t lda, + value_or_pointer> beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const std::complex* a, + std::int64_t lda, value_or_pointer beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const std::complex* a, + std::int64_t lda, value_or_pointer beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo 
upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, value_or_pointer beta, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, value_or_pointer beta, double* c, + std::int64_t ldc, const std::vector& dependencies = {}); + +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, value_or_pointer> beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, value_or_pointer> beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, + const std::complex* a, std::int64_t lda, const std::complex* b, std::int64_t ldb, value_or_pointer> beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, value_or_pointer beta, std::complex *c, - std::int64_t ldc, const std::vector 
&dependencies = {}); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, value_or_pointer beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, value_or_pointer beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, value_or_pointer beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies = {}); -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex 
*a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies = {}); + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, 
std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); // level 2, buffer -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& 
a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy); +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, 
sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy); +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a, +void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, sycl::buffer& a, std::int64_t lda); -void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda); - -void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, 
- sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -void hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -void hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - -void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda); - -void her(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda); - -void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - -void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void 
ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); + +void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); + +void 
her(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); + +void her(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); + +void her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + +void hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -void hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); -void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a); +void hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); -void hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a); +void hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); -void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, 
std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); +void hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); -void hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); +void hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); -void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); +void sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); -void sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); +void sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); +void symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); -void 
symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); +void symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, std::int64_t lda); +void syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda); -void syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, +void syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda); -void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda); +void syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); -void syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda); +void syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); -void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); +void spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, 
float alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); -void spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); +void spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); -void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a); +void spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a); -void spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a); +void spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a); -void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a); +void spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a); -void spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a); +void spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, 
sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, 
diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - std::int64_t k, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + std::int64_t k, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx); -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, 
transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx); +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx); -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t 
incx); +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx); -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); // level 2, USM -sycl::event gemv(sycl::queue &queue, transpose trans, 
std::int64_t m, std::int64_t n, - value_or_pointer alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, value_or_pointer beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, value_or_pointer beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, value_or_pointer alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, value_or_pointer beta, - float *y, std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, value_or_pointer alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, value_or_pointer beta, - double *y, std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const float* a, std::int64_t lda, const float* x, + 
std::int64_t incx, value_or_pointer beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, value_or_pointer beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, value_or_pointer alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, value_or_pointer beta, + float* y, std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, value_or_pointer alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, + value_or_pointer beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, value_or_pointer> beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); + const std::complex* a, 
std::int64_t lda, const std::complex* x, + std::int64_t incx, value_or_pointer> beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, + const std::complex* a, std::int64_t lda, const std::complex* x, std::int64_t incx, value_or_pointer> beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, - 
value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hemv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event her(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - 
std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event her2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event hpmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - const std::vector &dependencies = {}); - -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *a, const std::vector &dependencies = {}); - -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *a, const std::vector &dependencies = {}); - -sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - value_or_pointer alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, value_or_pointer beta, float *y, 
std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event sbmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, std::int64_t k, - value_or_pointer alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, value_or_pointer beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const float *a, std::int64_t lda, const float *x, std::int64_t incx, - value_or_pointer beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event symv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, - value_or_pointer beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); - -sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event syr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies = {}); - -sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event syr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a, - std::int64_t lda, const std::vector &dependencies = {}); - -sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const float *a, const float *x, std::int64_t incx, value_or_pointer beta, - float *y, std::int64_t incy, const std::vector &dependencies 
= {}); - -sycl::event spmv(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const double *a, const double *x, std::int64_t incx, value_or_pointer beta, - double *y, std::int64_t incy, const std::vector &dependencies = {}); - -sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const float *x, std::int64_t incx, float *a, - const std::vector &dependencies = {}); - -sycl::event spr(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const double *x, std::int64_t incx, double *a, - const std::vector &dependencies = {}); - -sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, float *a, - const std::vector &dependencies = {}); - -sycl::event spr2(sycl::queue &queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, - const double *x, std::int64_t incx, const double *y, std::int64_t incy, double *a, - const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector 
&dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event 
tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag 
unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); - -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies = {}); + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, value_or_pointer alpha, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, float* a, + std::int64_t lda, const std::vector& dependencies = {}); + +sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, value_or_pointer alpha, + const double* x, std::int64_t incx, const double* y, std::int64_t incy, double* a, + std::int64_t lda, const std::vector& dependencies = {}); + +sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event 
hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event hemv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event her(sycl::queue& queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + std::int64_t lda, const std::vector& dependencies = {}); + +sycl::event her(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event her2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + 
std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + const std::vector& dependencies = {}); + +sycl::event hpr(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies = {}); + +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, const std::vector& dependencies = {}); + +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* a, const std::vector& dependencies = {}); + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + value_or_pointer alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, value_or_pointer beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, std::int64_t k, + value_or_pointer alpha, const double* a, std::int64_t lda, 
const double* x, + std::int64_t incx, value_or_pointer beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, value_or_pointer beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event symv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, value_or_pointer beta, double* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, + const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const double* x, std::int64_t incx, double* a, + std::int64_t lda, const std::vector& dependencies = {}); + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event syr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const float* a, const float* x, std::int64_t incx, + value_or_pointer beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event spmv(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const double* a, const double* x, + std::int64_t incx, value_or_pointer beta, double* y, 
std::int64_t incy, + const std::vector& dependencies = {}); + +sycl::event spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, value_or_pointer alpha, + const float* x, std::int64_t incx, float* a, + const std::vector& dependencies = {}); + +sycl::event spr(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const double* x, std::int64_t incx, double* a, + const std::vector& dependencies = {}); + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* a, const std::vector& dependencies = {}); + +sycl::event spr2(sycl::queue& queue, uplo upper_lower, std::int64_t n, + value_or_pointer alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, + const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, float* x, + 
std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event 
tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); + +sycl::event trsv(sycl::queue& queue, uplo 
upper_lower, transpose trans, diag unit_diag, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies = {}); // level 1, buffer -void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); +void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result, index_base base=index_base::zero); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result, index_base base = index_base::zero); -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result, index_base base=index_base::zero); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& 
result, index_base base = index_base::zero); -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result, - index_base base=index_base::zero); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result, + index_base base = index_base::zero); -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result, - index_base base=index_base::zero); +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result, + index_base base = index_base::zero); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result, index_base base=index_base::zero); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result, index_base base = index_base::zero); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result, index_base base=index_base::zero); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result, index_base base = index_base::zero); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result, - index_base base=index_base::zero); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result, + index_base base = index_base::zero); -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result, - index_base base=index_base::zero); +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result, + index_base base = index_base::zero); -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& 
x, + std::int64_t incx, sycl::buffer& result); -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void axpy(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -void axpy(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); -void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); +void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); -void axpby(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t 
incy); +void axpby(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy); -void axpby(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy); +void axpby(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy); -void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); +void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); -void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); +void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> 
&y, std::int64_t incy); +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result); -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result); -void sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); +void sdsdot(sycl::queue& queue, std::int64_t n, float sb, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result); +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result); -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result); -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void 
nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result); +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, float c, +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, float c, float s); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, double c, +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, double c, double s); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, float c, float s); +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, float c, float s); -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, double c, double s); +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, double c, double s); -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s); +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s); -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -void 
rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); -void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer ¶m); +void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& param); -void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer ¶m); +void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& param); -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, float y1, sycl::buffer ¶m); +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param); -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, double y1, sycl::buffer ¶m); +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, double y1, sycl::buffer& param); -void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, +void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx); -void scal(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, +void scal(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx); -void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); +void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); -void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); +void scal(sycl::queue& queue, std::int64_t n, 
std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); -void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer, 1> &x, +void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer, 1>& x, std::int64_t incx); -void scal(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx); +void scal(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx); -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy); +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy); // level 1, USM -sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies = {}); +sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); -sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex 
*y, std::int64_t incy, - std::complex *result, const std::vector &dependencies = {}); +sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); -sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies = {}); +sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); -sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies = {}); +sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + 
const std::vector& dependencies = {}); -sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); -sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies = {}); +sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies = {}); -sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, float *result, - const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); -sycl::event asum(sycl::queue &queue, std::int64_t n, const 
std::complex *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); -sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - float *result, const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + float* result, const std::vector& dependencies = {}); -sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - double *result, const std::vector &dependencies = {}); +sycl::event asum(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + double* result, const std::vector& dependencies = {}); -sycl::event axpy(sycl::queue &queue, std::int64_t n, value_or_pointer alpha, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, std::int64_t n, value_or_pointer alpha, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpy(sycl::queue &queue, std::int64_t n, value_or_pointer alpha, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, std::int64_t n, value_or_pointer alpha, + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpy(sycl::queue &queue, std::int64_t n, value_or_pointer> alpha, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, std::int64_t n, value_or_pointer> alpha, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event 
axpy(sycl::queue &queue, std::int64_t n, value_or_pointer> alpha, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event axpy(sycl::queue& queue, std::int64_t n, value_or_pointer> alpha, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer alpha, const float *x, - std::int64_t incx, value_or_pointer beta, float *y, std::int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, std::int64_t n, value_or_pointer alpha, const float* x, + std::int64_t incx, value_or_pointer beta, float* y, std::int64_t incy, + const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer alpha, const double *x, - std::int64_t incx, value_or_pointer beta, double *y, std::int64_t incy, - const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, std::int64_t n, value_or_pointer alpha, + const double* x, std::int64_t incx, value_or_pointer beta, double* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer> alpha, - const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, std::int64_t n, value_or_pointer> alpha, + const std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event axpby(sycl::queue &queue, std::int64_t n, value_or_pointer> alpha, - const std::complex *x, std::int64_t incx, - value_or_pointer> beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event axpby(sycl::queue& queue, std::int64_t n, value_or_pointer> alpha, + const 
std::complex* x, std::int64_t incx, + value_or_pointer> beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, float* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, double* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); -sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); +sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); -sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *result, - const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies = {}); -sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *result, - const 
std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}); -sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *result, - const std::vector &dependencies = {}); +sycl::event sdsdot(sycl::queue& queue, std::int64_t n, float sb, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies = {}); -sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, double *result, - const std::vector &dependencies = {}); +sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, double* result, + const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, float *result, - const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, + const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, double *result, - const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, + const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - float *result, const std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + float* result, const std::vector& dependencies = {}); -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - double *result, const 
std::vector &dependencies = {}); +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + double* result, const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, value_or_pointer c, - value_or_pointer s, const std::vector &dependencies = {}); +sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, value_or_pointer c, + value_or_pointer s, const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, value_or_pointer c, - value_or_pointer s, const std::vector &dependencies = {}); +sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, value_or_pointer c, + value_or_pointer s, const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, +sycl::event rot(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, std::int64_t incy, value_or_pointer c, value_or_pointer s, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, +sycl::event rot(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, std::int64_t incy, value_or_pointer c, value_or_pointer s, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, - const std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s, - const 
std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, float *c, - std::complex *s, const std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, float* c, + std::complex* s, const std::vector& dependencies = {}); -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, double *c, - std::complex *s, const std::vector &dependencies = {}); +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, double* c, + std::complex* s, const std::vector& dependencies = {}); -sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, const float *param, - const std::vector &dependencies = {}); +sycl::event rotm(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + std::int64_t incy, const float* param, + const std::vector& dependencies = {}); -sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, const double *param, - const std::vector &dependencies = {}); +sycl::event rotm(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, + std::int64_t incy, const double* param, + const std::vector& dependencies = {}); -sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, value_or_pointer y1, - float *param, const std::vector &dependencies = {}); +sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, value_or_pointer y1, + float* param, const std::vector& dependencies = {}); -sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, value_or_pointer y1, - double *param, const std::vector &dependencies = {}); +sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, + value_or_pointer y1, double* param, + const 
std::vector& dependencies = {}); #define ONEMKL_DECLARE_SCAL(T, Ts) \ - sycl::event scal(sycl::queue &queue, std::int64_t n, value_or_pointer alpha, T *x, \ - std::int64_t incx, const std::vector &dependencies = {}); + sycl::event scal(sycl::queue& queue, std::int64_t n, value_or_pointer alpha, T* x, \ + std::int64_t incx, const std::vector& dependencies = {}); ONEMKL_DECLARE_SCAL(float, float) ONEMKL_DECLARE_SCAL(double, double) @@ -1371,1124 +1385,1134 @@ ONEMKL_DECLARE_SCAL(std::complex, std::complex) ONEMKL_DECLARE_SCAL(std::complex, std::complex) ONEMKL_DECLARE_SCAL(std::complex, float) ONEMKL_DECLARE_SCAL(std::complex, double) -sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); -sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, - std::int64_t incx, const std::vector &dependencies = {}); +sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); +sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, std::complex* x, + std::int64_t incx, const std::vector& dependencies = {}); #undef ONEMKL_DECLARE_SCAL -sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, const std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, + std::int64_t incy, const std::vector& dependencies = {}); -sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const 
std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); -sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies = {}); +sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies = {}); // extensions, buffer -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc); -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc); +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t 
lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc); +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int8_t ao, - sycl::buffer &b, std::int64_t ldb, std::uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, std::int8_t ao, + sycl::buffer& b, std::int64_t ldb, std::uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int8_t ao, - sycl::buffer &b, std::int64_t ldb, std::int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, std::int8_t ao, + sycl::buffer& b, std::int64_t ldb, std::int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemm_bias(sycl::queue& queue, transpose transa, transpose 
transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::uint8_t ao, - sycl::buffer &b, std::int64_t ldb, std::int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, std::uint8_t ao, + sycl::buffer& b, std::int64_t ldb, std::int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::uint8_t ao, - sycl::buffer &b, std::int64_t ldb, std::uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, std::uint8_t ao, + sycl::buffer& b, std::int64_t ldb, std::uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); // extensions, USM -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, value_or_pointer beta, - float *c, std::int64_t ldc, const std::vector &dependencies = {}); +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, value_or_pointer alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, value_or_pointer beta, + float* c, std::int64_t ldc, const std::vector& dependencies = {}); -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, value_or_pointer beta, - double *c, std::int64_t ldc, const std::vector &dependencies = {}); 
+sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + std::int64_t n, std::int64_t k, value_or_pointer alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, + value_or_pointer beta, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, value_or_pointer> beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, value_or_pointer> beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, + const std::complex* a, std::int64_t lda, const std::complex* b, std::int64_t ldb, value_or_pointer> beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::uint8_t *b, - std::int64_t ldb, std::uint8_t bo, value_or_pointer beta, std::int32_t *c, - std::int64_t ldc, const std::int32_t *co, - const std::vector 
&dependencies = {}); + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::uint8_t* b, + std::int64_t ldb, std::uint8_t bo, value_or_pointer beta, + std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies = {}); -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::int8_t *b, - std::int64_t ldb, std::int8_t bo, value_or_pointer beta, std::int32_t *c, - std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies = {}); + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::int8_t* b, + std::int64_t ldb, std::int8_t bo, value_or_pointer beta, + std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies = {}); -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, - const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - value_or_pointer beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, + const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + value_or_pointer beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies = {}); -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, 
value_or_pointer alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, - const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - value_or_pointer beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies = {}); + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, + const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + value_or_pointer beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies = {}); // batch, buffer -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, float beta, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, sycl::buffer, 1> &c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void syrk_batch(sycl::queue &queue, uplo 
upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size); -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, std::int64_t incy, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, std::int64_t incy, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, 
std::int64_t stridex, float beta, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, float beta, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, double beta, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, double beta, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &x, +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &x, +void gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& 
x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -void dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - 
sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t 
stride_b, std::int64_t batch_size); -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer &y, +void axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer &y, +void axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, 
std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, double beta, sycl::buffer &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, 
std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::half alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::half alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, sycl::half beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void 
gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, 
std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer &c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, +void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex 
alpha, sycl::buffer, 1> &a, +void omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &ab, std::int64_t lda, std::int64_t ldb, +void imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &ab, std::int64_t lda, std::int64_t ldb, +void imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -void imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &b, std::int64_t ldb, - 
std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, float beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex 
beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); -void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, +void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, +void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); -void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb); +void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb); -void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &b, std::int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, 
std::int64_t stridea, + sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &b, std::int64_t ldb, std::int64_t strideb); +void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, std::int64_t ldb, +void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); -void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, +void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); -void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); -void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb); +void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); 
-void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, std::int64_t lda, +void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); -void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, +void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, +void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& b, 
std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); -void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); +void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); // batch, usm -sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans, - const std::int64_t *n, const std::int64_t *k, const float *alpha, - const float **a, const std::int64_t *lda, const float *beta, float **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); -sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans, - const std::int64_t *n, const std::int64_t *k, const double *alpha, - const double **a, const std::int64_t *lda, const double *beta, double **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); -sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans, - const std::int64_t *n, const std::int64_t *k, - const std::complex *alpha, const std::complex **a, - const std::int64_t *lda, const std::complex *beta, - std::complex **c, const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); -sycl::event syrk_batch(sycl::queue &queue, const uplo *upper_lower, const transpose *trans, - const std::int64_t *n, const std::int64_t *k, - const std::complex *alpha, const std::complex **a, - const std::int64_t *lda, const std::complex *beta, - std::complex **c, const 
std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, value_or_pointer beta, float *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, value_or_pointer alpha, const double *a, +sycl::event syrk_batch(sycl::queue& queue, const uplo* upper_lower, const transpose* trans, + const std::int64_t* n, const std::int64_t* k, const float* alpha, + const float** a, const std::int64_t* lda, const float* beta, float** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); +sycl::event syrk_batch(sycl::queue& queue, const uplo* upper_lower, const transpose* trans, + const std::int64_t* n, const std::int64_t* k, const double* alpha, + const double** a, const std::int64_t* lda, const double* beta, double** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); +sycl::event syrk_batch(sycl::queue& queue, const uplo* upper_lower, const transpose* trans, + const std::int64_t* n, const std::int64_t* k, + const std::complex* alpha, const std::complex** a, + const std::int64_t* lda, const std::complex* beta, + std::complex** c, const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); +sycl::event syrk_batch(sycl::queue& queue, const uplo* upper_lower, const transpose* trans, + const std::int64_t* n, const std::int64_t* k, + const std::complex* alpha, const std::complex** a, + const std::int64_t* lda, const std::complex* beta, + std::complex** c, 
const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, value_or_pointer beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, value_or_pointer alpha, const double* a, std::int64_t lda, std::int64_t stride_a, value_or_pointer beta, - double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - value_or_pointer> beta, std::complex *c, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + value_or_pointer> beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, + const std::vector& dependencies = {}); +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - value_or_pointer> beta, std::complex *c, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + value_or_pointer> beta, std::complex* c, std::int64_t ldc, std::int64_t 
stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}); +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const float **x, - const std::int64_t 
*incx, float **y, const std::int64_t *incy, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const double **x, - const std::int64_t *incx, double **y, const std::int64_t *incy, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex **x, - const std::int64_t *incx, std::complex **y, const std::int64_t *incy, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event copy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex **x, - const std::int64_t *incx, std::complex **y, const std::int64_t *incy, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const float *a, std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float *c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event copy_batch(sycl::queue& queue, const std::int64_t* n, const float** x, + const std::int64_t* incx, float** y, const std::int64_t* incy, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event copy_batch(sycl::queue& queue, const std::int64_t* n, const double** x, + const std::int64_t* incx, double** y, const std::int64_t* incy, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event copy_batch(sycl::queue& queue, const std::int64_t* n, const std::complex** x, + const std::int64_t* incx, std::complex** y, const std::int64_t* incy, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event 
copy_batch(sycl::queue& queue, const std::int64_t* n, const std::complex** x, + const std::int64_t* incx, std::complex** y, const std::int64_t* incy, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const float* a, std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const double *a, std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double *c, std::int64_t ldc, +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const double* a, std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m, - const 
std::int64_t *n, const float **a, const std::int64_t *lda, - const float **x, const std::int64_t *incx, float **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m, - const std::int64_t *n, const double **a, const std::int64_t *lda, - const double **x, const std::int64_t *incx, double **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m, - const std::int64_t *n, const std::complex **a, - const std::int64_t *lda, const std::complex **x, - const std::int64_t *incx, std::complex **c, const std::int64_t *ldc, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event dgmm_batch(sycl::queue &queue, const side *left_right, const std::int64_t *m, - const std::int64_t *n, const std::complex **a, - const std::int64_t *lda, const std::complex **x, - const std::int64_t *incx, std::complex **c, const std::int64_t *ldc, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, value_or_pointer beta, float *y, std::int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, value_or_pointer beta, double *y, + const std::vector& 
dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, const side* left_right, const std::int64_t* m, + const std::int64_t* n, const float** a, const std::int64_t* lda, + const float** x, const std::int64_t* incx, float** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, const side* left_right, const std::int64_t* m, + const std::int64_t* n, const double** a, const std::int64_t* lda, + const double** x, const std::int64_t* incx, double** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, const side* left_right, const std::int64_t* m, + const std::int64_t* n, const std::complex** a, + const std::int64_t* lda, const std::complex** x, + const std::int64_t* incx, std::complex** c, const std::int64_t* ldc, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event dgmm_batch(sycl::queue& queue, const side* left_right, const std::int64_t* m, + const std::int64_t* n, const std::complex** a, + const std::int64_t* lda, const std::complex** x, + const std::int64_t* incx, 
std::complex** c, const std::int64_t* ldc, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, value_or_pointer beta, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, value_or_pointer beta, double* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, - value_or_pointer> beta, std::complex *y, + value_or_pointer> beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemv_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, +sycl::event gemv_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + 
std::int64_t lda, std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, - value_or_pointer> beta, std::complex *y, + value_or_pointer> beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const float *alpha, const float **a, - const std::int64_t *lda, const float **x, const std::int64_t *incx, - const float *beta, float **y, const std::int64_t *incy, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const double *alpha, const double **a, - const std::int64_t *lda, const double **x, const std::int64_t *incx, - const double *beta, double **y, const std::int64_t *incy, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - const std::complex **a, const std::int64_t *lda, - const std::complex **x, const std::int64_t *incx, - const std::complex *beta, std::complex **y, - const std::int64_t *incy, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event gemv_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - const std::complex **a, const std::int64_t *lda, - const std::complex **x, const std::int64_t *incx, - const std::complex *beta, std::complex **y, - const std::int64_t *incy, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const double 
*alpha, - const double **x, const std::int64_t *incx, double **y, - const std::int64_t *incy, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const float *alpha, - const float **x, const std::int64_t *incx, float **y, - const std::int64_t *incy, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex *alpha, - const std::complex **x, const std::int64_t *incx, - std::complex **y, const std::int64_t *incy, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, const std::int64_t *n, const std::complex *alpha, - const std::complex **x, const std::int64_t *incx, - std::complex **y, const std::int64_t *incy, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, value_or_pointer alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const float* alpha, const float** a, + const std::int64_t* lda, const float** x, const std::int64_t* incx, + const float* beta, float** y, const std::int64_t* incy, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const double* alpha, const double** a, + const std::int64_t* lda, const double** x, const std::int64_t* incx, + const double* beta, double** y, const std::int64_t* incy, + std::int64_t group_count, const std::int64_t* group_size, + const 
std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + const std::complex** a, const std::int64_t* lda, + const std::complex** x, const std::int64_t* incx, + const std::complex* beta, std::complex** y, + const std::int64_t* incy, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event gemv_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + const std::complex** a, const std::int64_t* lda, + const std::complex** x, const std::int64_t* incx, + const std::complex* beta, std::complex** y, + const std::int64_t* incy, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, const std::int64_t* n, const double* alpha, + const double** x, const std::int64_t* incx, double** y, + const std::int64_t* incy, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, const std::int64_t* n, const float* alpha, + const float** x, const std::int64_t* incx, float** y, + const std::int64_t* incy, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, const std::int64_t* n, const std::complex* alpha, + const std::complex** x, const std::int64_t* incx, + std::complex** y, const std::int64_t* incy, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, const std::int64_t* n, const std::complex* alpha, + const std::complex** x, const std::int64_t* incx, + std::complex** y, const std::int64_t* incy, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& 
dependencies = {}); + +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, value_or_pointer alpha, + const float* x, std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, value_or_pointer alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, value_or_pointer alpha, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, value_or_pointer> alpha, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies = {}); - -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, - value_or_pointer> alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const float *alpha, const float **a, const std::int64_t *lda, - const float **b, const std::int64_t *ldb, const float *beta, float **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const 
transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const double *alpha, const double **a, const std::int64_t *lda, - const double **b, const std::int64_t *ldb, const double *beta, double **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const std::complex *alpha, const std::complex **a, - const std::int64_t *lda, const std::complex **b, - const std::int64_t *ldb, const std::complex *beta, - std::complex **c, const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const std::complex *alpha, const std::complex **a, - const std::int64_t *lda, const std::complex **b, - const std::int64_t *ldb, const std::complex *beta, - std::complex **c, const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const sycl::half *alpha, const sycl::half **a, const std::int64_t *lda, - const sycl::half **b, const std::int64_t *ldb, const sycl::half *beta, - sycl::half **c, const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const float *alpha, const sycl::half **a, const 
std::int64_t *lda, - const sycl::half **b, const std::int64_t *ldb, const float *beta, float **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const float *alpha, const bfloat16 **a, const std::int64_t *lda, - const bfloat16 **b, const std::int64_t *ldb, const float *beta, - bfloat16 **c, const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const float *alpha, const bfloat16 **a, const std::int64_t *lda, - const bfloat16 **b, const std::int64_t *ldb, const float *beta, - float **c, const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const float *alpha, const std::int8_t **a, const std::int64_t *lda, - const std::int8_t **b, const std::int64_t *ldb, const float *beta, - std::int32_t **c, const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, const transpose *transa, const transpose *transb, - const std::int64_t *m, const std::int64_t *n, const std::int64_t *k, - const float *alpha, const std::int8_t **a, const std::int64_t *lda, - const std::int8_t **b, const std::int64_t *ldb, const float *beta, float **c, - const std::int64_t *ldc, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); 
- -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, const float *b, std::int64_t ldb, - std::int64_t stride_b, value_or_pointer beta, float *c, std::int64_t ldc, - std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, value_or_pointer alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, const double *b, std::int64_t ldb, - std::int64_t stride_b, value_or_pointer beta, double *c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, + value_or_pointer> alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, + std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const float* alpha, const float** a, const std::int64_t* lda, + const float** b, const std::int64_t* ldb, const float* beta, float** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const double* alpha, const double** a, const std::int64_t* lda, + const double** b, const std::int64_t* ldb, const double* beta, double** c, + 
const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const std::complex* alpha, const std::complex** a, + const std::int64_t* lda, const std::complex** b, + const std::int64_t* ldb, const std::complex* beta, + std::complex** c, const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const std::complex* alpha, const std::complex** a, + const std::int64_t* lda, const std::complex** b, + const std::int64_t* ldb, const std::complex* beta, + std::complex** c, const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const sycl::half* alpha, const sycl::half** a, const std::int64_t* lda, + const sycl::half** b, const std::int64_t* ldb, const sycl::half* beta, + sycl::half** c, const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const float* alpha, const sycl::half** a, const std::int64_t* lda, + const sycl::half** b, const std::int64_t* ldb, const float* beta, float** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event 
gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const float* alpha, const bfloat16** a, const std::int64_t* lda, + const bfloat16** b, const std::int64_t* ldb, const float* beta, bfloat16** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const float* alpha, const bfloat16** a, const std::int64_t* lda, + const bfloat16** b, const std::int64_t* ldb, const float* beta, float** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const float* alpha, const std::int8_t** a, const std::int64_t* lda, + const std::int8_t** b, const std::int64_t* ldb, const float* beta, + std::int32_t** c, const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, const transpose* transa, const transpose* transb, + const std::int64_t* m, const std::int64_t* n, const std::int64_t* k, + const float* alpha, const std::int8_t** a, const std::int64_t* lda, + const std::int8_t** b, const std::int64_t* ldb, const float* beta, float** c, + const std::int64_t* ldc, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, value_or_pointer alpha, + const float* a, std::int64_t lda, std::int64_t stride_a, const float* b, + 
std::int64_t ldb, std::int64_t stride_b, value_or_pointer beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, value_or_pointer alpha, + const double* a, std::int64_t lda, std::int64_t stride_a, const double* b, + std::int64_t ldb, std::int64_t stride_b, value_or_pointer beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer> beta, std::complex *c, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer> beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer> beta, std::complex *c, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer> beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = 
{}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer beta, sycl::half *c, std::int64_t ldc, + const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer beta, sycl::half* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); + const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const bfloat16 *a, std::int64_t lda, std::int64_t stride_a, - const bfloat16 *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer beta, bfloat16 *c, std::int64_t ldc, + const bfloat16* a, 
std::int64_t lda, std::int64_t stride_a, + const bfloat16* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer beta, bfloat16* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const bfloat16 *a, std::int64_t lda, std::int64_t stride_a, - const bfloat16 *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); + const bfloat16* a, std::int64_t lda, std::int64_t stride_a, + const bfloat16* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer beta, std::int32_t *c, std::int64_t ldc, + const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer beta, std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose 
transb, std::int64_t m, std::int64_t n, std::int64_t k, value_or_pointer alpha, - const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, - value_or_pointer beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, + value_or_pointer beta, float* c, std::int64_t ldc, + std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies = {}); -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t m, std::int64_t n, value_or_pointer alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies = {}); -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t 
stride_a, std::complex *b, + value_or_pointer alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower, - const transpose *trans, const diag *unit_diag, const std::int64_t *m, - const std::int64_t *n, const float *alpha, const float **a, - const std::int64_t *lda, float **b, const std::int64_t *ldb, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower, - const transpose *trans, const diag *unit_diag, const std::int64_t *m, - const std::int64_t *n, const double *alpha, const double **a, - const std::int64_t *lda, double **b, const std::int64_t *ldb, - std::int64_t group_count, const std::int64_t *group_size, - const std::vector &dependencies = {}); - 
-sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower, - const transpose *trans, const diag *unit_diag, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - const std::complex **a, const std::int64_t *lda, - std::complex **b, const std::int64_t *ldb, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event trsm_batch(sycl::queue &queue, const side *left_right, const uplo *upper_lower, - const transpose *trans, const diag *unit_diag, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - const std::complex **a, const std::int64_t *lda, - std::complex **b, const std::int64_t *ldb, std::int64_t group_count, - const std::int64_t *group_size, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, const side* left_right, const uplo* upper_lower, + const transpose* trans, const diag* unit_diag, const std::int64_t* m, + const std::int64_t* n, const float* alpha, const float** a, + const std::int64_t* lda, float** b, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, const side* left_right, const uplo* upper_lower, + const transpose* trans, const diag* unit_diag, const std::int64_t* m, + const std::int64_t* n, const double* alpha, const double** a, + const std::int64_t* lda, double** b, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, const side* left_right, const uplo* upper_lower, + const 
transpose* trans, const diag* unit_diag, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + const std::complex** a, const std::int64_t* lda, + std::complex** b, const std::int64_t* ldb, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event trsm_batch(sycl::queue& queue, const side* left_right, const uplo* upper_lower, + const transpose* trans, const diag* unit_diag, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + const std::complex** a, const std::int64_t* lda, + std::complex** b, const std::int64_t* ldb, std::int64_t group_count, + const std::int64_t* group_size, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t 
n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, float *ab, std::int64_t lda, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, double *ab, std::int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, std::complex *ab, +sycl::event imatcopy_batch(sycl::queue& queue, 
transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, std::complex *ab, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, value_or_pointer alpha, const float *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, value_or_pointer alpha, const float* a, std::int64_t lda, std::int64_t stride_a, value_or_pointer beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, value_or_pointer alpha, const double *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, value_or_pointer alpha, const double* a, std::int64_t lda, std::int64_t stride_a, value_or_pointer beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); 
+ const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - value_or_pointer> beta, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex *c, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + value_or_pointer> beta, const std::complex* b, + std::int64_t ldb, std::int64_t stride_b, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - value_or_pointer> beta, const std::complex *b, - std::int64_t ldb, std::int64_t stride_b, std::complex *c, - std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const float *a, std::int64_t lda, float *b, - std::int64_t ldb, const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const double *a, std::int64_t lda, double *b, - std::int64_t ldb, const std::vector &dependencies = {}); - -sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - 
-sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); - -sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + value_or_pointer> beta, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const float* a, std::int64_t lda, float* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const double* a, std::int64_t lda, double* b, + std::int64_t ldb, const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy(sycl::queue& queue, 
transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies = {}); + +sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, std::complex *b, +sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); +sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); 
-sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer alpha, double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); +sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer alpha, double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies = {}); -sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, std::complex *ab, +sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - value_or_pointer> alpha, std::complex *ab, +sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + value_or_pointer> alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, value_or_pointer alpha, const float *a, std::int64_t lda, - value_or_pointer beta, const float *b, std::int64_t ldb, float *c, - std::int64_t ldc, const std::vector &dependencies = {}); +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, value_or_pointer alpha, const float* a, std::int64_t lda, + value_or_pointer beta, const float* b, std::int64_t ldb, float* c, + std::int64_t ldc, const std::vector& dependencies = {}); -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, value_or_pointer alpha, const double *a, std::int64_t lda, - value_or_pointer beta, const double *b, 
std::int64_t ldb, double *c, - std::int64_t ldc, const std::vector &dependencies = {}); +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, value_or_pointer alpha, const double* a, + std::int64_t lda, value_or_pointer beta, const double* b, + std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, - value_or_pointer> beta, const std::complex *b, - std::int64_t ldb, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); + const std::complex* a, std::int64_t lda, + value_or_pointer> beta, const std::complex* b, + std::int64_t ldb, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, value_or_pointer> alpha, - const std::complex *a, std::int64_t lda, - value_or_pointer> beta, const std::complex *b, - std::int64_t ldb, std::complex *c, std::int64_t ldc, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const float *alpha, const float **a, - const std::int64_t *lda, float **b, const std::int64_t *ldb, - std::int64_t group_count, const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const double *alpha, const double **a, - const std::int64_t *lda, double **b, const std::int64_t *ldb, - std::int64_t group_count, const 
std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - const std::complex **a, const std::int64_t *lda, - std::complex **b, const std::int64_t *ldb, - std::int64_t group_count, const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event omatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - const std::complex **a, const std::int64_t *lda, - std::complex **b, const std::int64_t *ldb, - std::int64_t group_count, const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const float *alpha, float **ab, - const std::int64_t *lda, const std::int64_t *ldb, - std::int64_t group_count, const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const double *alpha, double **ab, - const std::int64_t *lda, const std::int64_t *ldb, - std::int64_t group_count, const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - std::complex **ab, const std::int64_t *lda, - const std::int64_t *ldb, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector &dependencies = {}); - -sycl::event imatcopy_batch(sycl::queue &queue, const transpose *trans, const std::int64_t *m, - const std::int64_t *n, const std::complex *alpha, - std::complex **ab, const std::int64_t *lda, - const std::int64_t *ldb, std::int64_t group_count, - const std::int64_t *groupsize, - const std::vector 
&dependencies = {}); + const std::complex* a, std::int64_t lda, + value_or_pointer> beta, const std::complex* b, + std::int64_t ldb, std::complex* c, std::int64_t ldc, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const float* alpha, const float** a, + const std::int64_t* lda, float** b, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const double* alpha, const double** a, + const std::int64_t* lda, double** b, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + const std::complex** a, const std::int64_t* lda, + std::complex** b, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event omatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + const std::complex** a, const std::int64_t* lda, + std::complex** b, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event imatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const float* alpha, float** ab, + const std::int64_t* lda, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event imatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const double* alpha, 
double** ab, + const std::int64_t* lda, const std::int64_t* ldb, + std::int64_t group_count, const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event imatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + std::complex** ab, const std::int64_t* lda, + const std::int64_t* ldb, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); + +sycl::event imatcopy_batch(sycl::queue& queue, const transpose* trans, const std::int64_t* m, + const std::int64_t* n, const std::complex* alpha, + std::complex** ab, const std::int64_t* lda, + const std::int64_t* ldb, std::int64_t group_count, + const std::int64_t* groupsize, + const std::vector& dependencies = {}); diff --git a/src/blas/backends/mkl_common/mkl_extensions.cxx b/src/blas/backends/mkl_common/mkl_extensions.cxx index 4672af5c7..171e2251a 100644 --- a/src/blas/backends/mkl_common/mkl_extensions.cxx +++ b/src/blas/backends/mkl_common/mkl_extensions.cxx @@ -19,341 +19,341 @@ // Buffer APIs -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, uint8_t 
bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - 
sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, 
int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float 
alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t 
ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb); } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, float beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, float beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, double beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, double beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(sycl::queue &queue, transpose 
transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc) { blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc) { blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } // USM APIs -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, 
int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const uint8_t* b, int64_t 
ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { return blas_major::gemm_bias(queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const 
std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::gemmt(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, float *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, float* b, int64_t ldb, + const std::vector& dependencies) { return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, double *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, double* b, int64_t ldb, + const std::vector& dependencies) { return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, 
- std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies) { return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies) { return blas_major::omatcopy(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, std::int64_t stridea, float* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", ""); } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, std::int64_t stridea, double* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", ""); } -sycl::event omatcopy2(sycl::queue &queue, 
transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", ""); } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", ""); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t 
n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { return blas_major::imatcopy(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, float beta, const float *b, - int64_t ldb, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, float beta, const float* b, + int64_t ldb, float* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, double beta, const double *b, - int64_t ldb, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, double beta, const double* b, + int64_t ldb, double* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, 
lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { return blas_major::omatadd(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } diff --git a/src/blas/backends/mkl_common/mkl_level1.cxx b/src/blas/backends/mkl_common/mkl_level1.cxx index 85ccb0025..d109282d8 100644 --- a/src/blas/backends/mkl_common/mkl_level1.cxx +++ b/src/blas/backends/mkl_common/mkl_level1.cxx @@ -19,627 +19,627 @@ // Buffer APIs -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { blas_major::asum(queue, n, x, incx, result); } -void asum(sycl::queue &queue, std::int64_t n, 
sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { blas_major::asum(queue, n, x, incx, result); } -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::asum(queue, n, x, incx, result); } -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::asum(queue, n, x, incx, result); } -void axpy(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void axpy(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { blas_major::axpy(queue, n, alpha, x, incx, y, incy); } -void axpy(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void axpy(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { blas_major::axpy(queue, n, alpha, x, incx, y, incy); } -void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::axpy(queue, n, alpha, x, incx, y, incy); } -void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + 
sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::axpy(queue, n, alpha, x, incx, y, incy); } -void axpby(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { +void axpby(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { +void axpby(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy); } -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy) { blas_major::copy(queue, n, x, incx, y, incy); } -void copy(sycl::queue 
&queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy) { blas_major::copy(queue, n, x, incx, y, incy); } -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy) { +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::copy(queue, n, x, incx, y, incy); } -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy) { +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::copy(queue, n, x, incx, y, incy); } -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result) { blas_major::dot(queue, n, x, incx, y, incy, result); } -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result) { blas_major::dot(queue, n, x, incx, y, incy, result); } -void sdsdot(sycl::queue &queue, std::int64_t n, float sb, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void sdsdot(sycl::queue& queue, std::int64_t n, float sb, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { blas_major::sdsdot(queue, n, sb, x, incx, y, incy, result); } -void dot(sycl::queue &queue, std::int64_t 
n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result) { blas_major::dot(queue, n, x, incx, y, incy, result); } -void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { blas_major::dotc(queue, n, x, incx, y, incy, result); } -void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { blas_major::dotc(queue, n, x, incx, y, incy, result); } -void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { blas_major::dotu(queue, n, x, incx, y, incy, result); } -void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { blas_major::dotu(queue, n, x, incx, y, incy, result); } -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, 
sycl::buffer& result) { blas_major::nrm2(queue, n, x, incx, result); } -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { blas_major::nrm2(queue, n, x, incx, result); } -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::nrm2(queue, n, x, incx, result); } -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::nrm2(queue, n, x, incx, result); } -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, float c, +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { blas_major::rot(queue, n, x, incx, y, incy, c, s); } -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, double c, +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { blas_major::rot(queue, n, x, incx, y, incy, c, s); } -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, float c, float s) { blas_major::rot(queue, n, x, incx, y, incy, c, s); } -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t 
incy, double c, double s) { +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, double c, double s) { blas_major::rot(queue, n, x, incx, y, incy, c, s); } -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s) { +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { blas_major::rotg(queue, a, b, c, s); } -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s) { +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { blas_major::rotg(queue, a, b, c, s); } -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { blas_major::rotg(queue, a, b, c, s); } -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { blas_major::rotg(queue, a, b, c, s); } -void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer ¶m) { +void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& param) { blas_major::rotm(queue, n, x, incx, y, incy, param); } -void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer ¶m) { +void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& param) { blas_major::rotm(queue, n, x, incx, y, incy, param); } -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer 
&d2, - sycl::buffer &x1, float y1, sycl::buffer ¶m) { +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param) { blas_major::rotmg(queue, d1, d2, x1, y1, param); } -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, double y1, sycl::buffer ¶m) { +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, double y1, sycl::buffer& param) { blas_major::rotmg(queue, d1, d2, x1, y1, param); } -void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, +void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, std::int64_t incx) { blas_major::scal(queue, n, alpha, x, incx); } -void scal(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, +void scal(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, std::int64_t incx) { blas_major::scal(queue, n, alpha, x, incx); } -void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::scal(queue, n, alpha, x, incx); } -void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::scal(queue, n, alpha, x, incx); } -void scal(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer, 1> &x, +void scal(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer, 1>& x, std::int64_t incx) { blas_major::scal(queue, n, alpha, x, incx); } -void scal(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::scal(queue, n, alpha, x, incx); } -void 
swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy) { blas_major::swap(queue, n, x, incx, y, incy); } -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy) { blas_major::swap(queue, n, x, incx, y, incy); } -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy) { +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::swap(queue, n, x, incx, y, incy); } -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy) { +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::swap(queue, n, x, incx, y, incy); } -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::iamax(queue, n, x, incx, result); } -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::iamax(queue, n, x, incx, result); } -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { blas_major::iamax(queue, n, x, 
incx, result); } -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { blas_major::iamax(queue, n, x, incx, result); } -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::iamin(queue, n, x, incx, result); } -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { blas_major::iamin(queue, n, x, incx, result); } -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { blas_major::iamin(queue, n, x, incx, result); } -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { blas_major::iamin(queue, n, x, incx, result); } // USM APIs -sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, float *result, const std::vector &dependencies) { +sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, float* result, const std::vector& dependencies) { return blas_major::asum(queue, n, x, incx, result, dependencies); } -sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t 
incx, double* result, const std::vector& dependencies) { return blas_major::asum(queue, n, x, incx, result, dependencies); } -sycl::event asum(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - float *result, const std::vector &dependencies) { +sycl::event asum(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + float* result, const std::vector& dependencies) { return blas_major::asum(queue, n, x, incx, result, dependencies); } -sycl::event asum(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - double *result, const std::vector &dependencies) { +sycl::event asum(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + double* result, const std::vector& dependencies) { return blas_major::asum(queue, n, x, incx, result, dependencies); } -sycl::event axpy(sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, - float *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event axpy(sycl::queue& queue, std::int64_t n, float alpha, const float* x, std::int64_t incx, + float* y, std::int64_t incy, const std::vector& dependencies) { return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(sycl::queue &queue, std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::complex* y, + 
std::int64_t incy, const std::vector& dependencies) { return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::axpy(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpby(sycl::queue &queue, std::int64_t n, float alpha, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(sycl::queue &queue, std::int64_t n, double alpha, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, 
dependencies); } -sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::axpby(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event copy(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, float *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event copy(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, float* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::copy(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, double *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event copy(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, double* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::copy(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::copy(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t 
incy, + const std::vector& dependencies) { return blas_major::copy(queue, n, x, incx, y, incy, dependencies); } -sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { return blas_major::dot(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dot(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { return blas_major::dot(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event sdsdot(sycl::queue &queue, std::int64_t n, float sb, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event sdsdot(sycl::queue& queue, std::int64_t n, float sb, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { return blas_major::sdsdot(queue, n, sb, x, incx, y, incy, result, dependencies); } -sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { return blas_major::dot(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, 
std::int64_t incy, - std::complex *result, const std::vector &dependencies) { +sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies) { return blas_major::dotc(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies) { +sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies) { return blas_major::dotc(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies) { +sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies) { return blas_major::dotu(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies) { +sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies) { return blas_major::dotu(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, float *result, const std::vector &dependencies) { +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const 
std::complex* x, + std::int64_t incx, float* result, const std::vector& dependencies) { return blas_major::nrm2(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, double* result, const std::vector& dependencies) { return blas_major::nrm2(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - float *result, const std::vector &dependencies) { +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + float* result, const std::vector& dependencies) { return blas_major::nrm2(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - double *result, const std::vector &dependencies) { +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + double* result, const std::vector& dependencies) { return blas_major::nrm2(queue, n, x, incx, result, dependencies); } -sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, double c, double s, + 
const std::vector& dependencies) { return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rot(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, float c, float s, const std::vector &dependencies) { +sycl::event rot(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + std::int64_t incy, float c, float s, const std::vector& dependencies) { return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rot(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, +sycl::event rot(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { + const std::vector& dependencies) { return blas_major::rot(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, - const std::vector &dependencies) { +sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { return blas_major::rotg(queue, a, b, c, s, dependencies); } -sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s, - const std::vector &dependencies) { +sycl::event rotg(sycl::queue& queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies) { return blas_major::rotg(queue, a, b, c, s, dependencies); } -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, float *c, - std::complex *s, const std::vector &dependencies) { +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, float* c, + std::complex* s, const std::vector& dependencies) { return blas_major::rotg(queue, a, b, c, s, dependencies); } -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, double *c, - std::complex *s, const std::vector &dependencies) { +sycl::event rotg(sycl::queue& queue, std::complex* 
a, std::complex* b, double* c, + std::complex* s, const std::vector& dependencies) { return blas_major::rotg(queue, a, b, c, s, dependencies); } -sycl::event rotm(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, float *param, const std::vector &dependencies) { +sycl::event rotm(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + std::int64_t incy, float* param, const std::vector& dependencies) { return blas_major::rotm(queue, n, x, incx, y, incy, param, dependencies); } -sycl::event rotm(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, double *param, const std::vector &dependencies) { +sycl::event rotm(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, + std::int64_t incy, double* param, const std::vector& dependencies) { return blas_major::rotm(queue, n, x, incx, y, incy, param, dependencies); } -sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, - const std::vector &dependencies) { +sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, float y1, float* param, + const std::vector& dependencies) { return blas_major::rotmg(queue, d1, d2, x1, y1, param, dependencies); } -sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, - const std::vector &dependencies) { +sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, double y1, double* param, + const std::vector& dependencies) { return blas_major::rotmg(queue, d1, d2, x1, y1, param, dependencies); } -sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, float* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::scal(queue, n, alpha, x, incx, dependencies); } -sycl::event 
scal(sycl::queue &queue, std::int64_t n, double alpha, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, double* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::scal(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::scal(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::scal(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(sycl::queue &queue, std::int64_t n, float alpha, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, float alpha, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { return blas_major::scal(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(sycl::queue &queue, std::int64_t n, double alpha, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, double alpha, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { return blas_major::scal(queue, n, alpha, x, incx, dependencies); } -sycl::event swap(sycl::queue &queue, std::int64_t n, float *x, std::int64_t incx, float *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(sycl::queue& queue, std::int64_t n, float* x, std::int64_t incx, float* y, + 
std::int64_t incy, const std::vector& dependencies) { return blas_major::swap(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(sycl::queue &queue, std::int64_t n, double *x, std::int64_t incx, double *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(sycl::queue& queue, std::int64_t n, double* x, std::int64_t incx, double* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::swap(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::swap(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::swap(queue, n, x, incx, y, incy, dependencies); } -sycl::event iamax(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies) { +sycl::event iamax(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies) { return blas_major::iamax(queue, n, x, incx, result, dependencies); } -sycl::event iamax(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies) { +sycl::event iamax(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies) { return blas_major::iamax(queue, n, x, 
incx, result, dependencies); } -sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return blas_major::iamax(queue, n, x, incx, result, dependencies); } -sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return blas_major::iamax(queue, n, x, incx, result, dependencies); } -sycl::event iamin(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies) { +sycl::event iamin(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies) { return blas_major::iamin(queue, n, x, incx, result, dependencies); } -sycl::event iamin(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies) { +sycl::event iamin(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies) { return blas_major::iamin(queue, n, x, incx, result, dependencies); } -sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return blas_major::iamin(queue, n, x, incx, result, dependencies); } -sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex *x, 
- std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return blas_major::iamin(queue, n, x, incx, result, dependencies); } diff --git a/src/blas/backends/mkl_common/mkl_level2.cxx b/src/blas/backends/mkl_common/mkl_level2.cxx index 83494be12..56fa591dc 100644 --- a/src/blas/backends/mkl_common/mkl_level2.cxx +++ b/src/blas/backends/mkl_common/mkl_level2.cxx @@ -19,844 +19,844 @@ // Buffer APIs -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t 
lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t 
kl, - std::int64_t ku, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, + std::int64_t ku, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, sycl::buffer &a, +void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, sycl::buffer& a, std::int64_t lda) { blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { 
blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::geru(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::geru(queue, m, n, alpha, x, incx, y, incy, 
a, lda); } -void hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hemv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hemv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, std::int64_t lda, + 
sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy); } -void her(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +void her(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::her(queue, uplo, n, alpha, x, incx, a, lda); } -void her(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +void her(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::her(queue, uplo, n, alpha, x, incx, a, lda); } -void her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void her2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda); } -void her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void her2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda); } -void hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, - 
std::int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void hpmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy); } -void hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, - std::int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void hpmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, + std::int64_t incx, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy); } -void hpr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +void hpr(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { blas_major::hpr(queue, uplo, n, alpha, x, incx, a); } -void hpr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +void hpr(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { blas_major::hpr(queue, uplo, n, alpha, x, incx, a); } -void hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { +void hpr2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a); } -void hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t 
incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { +void hpr2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a); } -void sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { +void sbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, float alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { +void sbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, double alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void spmv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void spmv(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy); } -void spmv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void spmv(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer& a, + sycl::buffer& x, 
std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy); } -void spr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a) { +void spr(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& a) { blas_major::spr(queue, uplo, n, alpha, x, incx, a); } -void spr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a) { +void spr(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& a) { blas_major::spr(queue, uplo, n, alpha, x, incx, a); } -void spr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a) { +void spr2(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a) { blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a); } -void spr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a) { +void spr2(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a) { blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a); } -void symv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +void symv(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { blas_major::symv(queue, uplo, n, alpha, a, lda, x, 
incx, beta, y, incy); } -void symv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy) { +void symv(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy) { blas_major::symv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy); } -void syr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda) { +void syr(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& a, std::int64_t lda) { blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda); } -void syr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda) { +void syr(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& a, std::int64_t lda) { blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda); } -void syr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda) { +void syr2(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda) { blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda); } -void syr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda) { +void syr2(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda) 
{ blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda); } -void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t 
n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, std::int64_t k, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx); } -void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx) { +void tpmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx); } -void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx) { +void tpmv(sycl::queue& queue, uplo uplo, transpose trans, 
diag diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx); } -void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx); } -void tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx); } -void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx) { +void tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx); } -void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx) { +void tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx); } -void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx); } -void tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, sycl::buffer, 1> &x, 
+void tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, sycl::buffer, 1>& x, std::int64_t incx) { blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx); } -void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx); } -void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx); } -void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx); } -void trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx); } -void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + 
sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx); } -void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, +void trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, std::int64_t incx) { blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx); } -void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx); } -void trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx); } // USM APIs -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, float alpha, + const float* a, std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t 
m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, double alpha, + const double* a, std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::gemv(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, 
std::int64_t ku, float alpha, const float *a, std::int64_t lda, - const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, float alpha, const float* a, std::int64_t lda, + const float* x, std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, double alpha, const double *a, std::int64_t lda, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, double alpha, const double* a, std::int64_t lda, + const double* x, std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(sycl::queue &queue, 
transpose trans, std::int64_t m, std::int64_t n, +sycl::event gbmv(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::gbmv(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::ger(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event gerc(sycl::queue& queue, 
std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::gerc(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return 
blas_major::geru(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::hbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event hemv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, 
beta, y, incy, dependencies); } -sycl::event hemv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event hemv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::hemv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event her(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return blas_major::her(queue, uplo, n, alpha, x, incx, a, lda, dependencies); } -sycl::event her(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return blas_major::her(queue, uplo, n, alpha, x, incx, a, lda, dependencies); } -sycl::event her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event her2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + 
std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event her2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event her2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::her2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hpmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event hpmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *a, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hpmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* a, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::hpmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event hpr(sycl::queue &queue, uplo uplo, std::int64_t n, float 
alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + const std::vector& dependencies) { return blas_major::hpr(queue, uplo, n, alpha, x, incx, a, dependencies); } -sycl::event hpr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + const std::vector& dependencies) { return blas_major::hpr(queue, uplo, n, alpha, x, incx, a, dependencies); } -sycl::event hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, + const std::vector& dependencies) { return blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event hpr2(sycl::queue &queue, uplo uplo, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr2(sycl::queue& queue, uplo uplo, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, + const std::vector& dependencies) { return blas_major::hpr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const 
float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event sbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, float alpha, + const float* a, std::int64_t lda, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { return blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event sbmv(sycl::queue &queue, uplo uplo, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event sbmv(sycl::queue& queue, uplo uplo, std::int64_t n, std::int64_t k, double alpha, + const double* a, std::int64_t lda, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { return blas_major::sbmv(queue, uplo, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event spmv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *a, - const float *x, std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event spmv(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, const float* a, + const float* x, std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event spmv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *a, - const double *x, std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event spmv(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, const double* a, + const double* x, std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return 
blas_major::spmv(queue, uplo, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event spr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *a, const std::vector &dependencies) { +sycl::event spr(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, const std::vector& dependencies) { return blas_major::spr(queue, uplo, n, alpha, x, incx, a, dependencies); } -sycl::event spr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *a, const std::vector &dependencies) { +sycl::event spr(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, const std::vector& dependencies) { return blas_major::spr(queue, uplo, n, alpha, x, incx, a, dependencies); } -sycl::event spr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *a, - const std::vector &dependencies) { +sycl::event spr2(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* a, + const std::vector& dependencies) { return blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event spr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *a, - const std::vector &dependencies) { +sycl::event spr2(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* a, + const std::vector& dependencies) { return blas_major::spr2(queue, uplo, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event symv(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float 
beta, float *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event symv(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::symv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event symv(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event symv(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { return blas_major::symv(queue, uplo, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event syr(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda, dependencies); } -sycl::event syr(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::syr(queue, uplo, n, alpha, x, incx, a, lda, dependencies); } -sycl::event syr2(sycl::queue &queue, uplo uplo, std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *a, 
std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr2(sycl::queue& queue, uplo uplo, std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event syr2(sycl::queue &queue, uplo uplo, std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr2(sycl::queue& queue, uplo uplo, std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { return blas_major::syr2(queue, uplo, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, 
std::int64_t n, - std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbmv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - std::int64_t k, const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - std::int64_t k, const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbsv(queue, uplo, trans, diag, 
n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - std::int64_t k, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + std::int64_t k, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tbsv(queue, uplo, trans, diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, dependencies); } -sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, 
dependencies); } -sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const std::complex* a, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, dependencies); } -sycl::event tpmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const std::complex* a, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpmv(queue, uplo, trans, diag, n, a, x, incx, dependencies); } -sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies); } -sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies); } -sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::complex *x, std::int64_t incx, - const std::vector 
&dependencies) { +sycl::event tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const std::complex* a, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies); } -sycl::event tpsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const std::complex* a, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::tpsv(queue, uplo, trans, diag, n, a, x, incx, dependencies); } -sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::int64_t lda, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, 
std::int64_t n, + const std::complex* a, std::int64_t lda, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::int64_t lda, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trmv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const std::complex* a, std::int64_t lda, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { return blas_major::trmv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const float *a, std::int64_t lda, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const float* a, std::int64_t lda, float* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const double *a, std::int64_t lda, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const double* a, std::int64_t lda, double* x, std::int64_t incx, + const std::vector& dependencies) { return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::int64_t lda, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const 
std::complex* a, std::int64_t lda, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(sycl::queue &queue, uplo uplo, transpose trans, diag diag, std::int64_t n, - const std::complex *a, std::int64_t lda, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trsv(sycl::queue& queue, uplo uplo, transpose trans, diag diag, std::int64_t n, + const std::complex* a, std::int64_t lda, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { return blas_major::trsv(queue, uplo, trans, diag, n, a, lda, x, incx, dependencies); } diff --git a/src/blas/backends/mkl_common/mkl_level3.cxx b/src/blas/backends/mkl_common/mkl_level3.cxx index d52c710f1..e67afc26d 100644 --- a/src/blas/backends/mkl_common/mkl_level3.cxx +++ b/src/blas/backends/mkl_common/mkl_level3.cxx @@ -19,501 +19,501 @@ // Buffer APIs -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, sycl::buffer &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, 
sycl::buffer& c, std::int64_t ldc) { blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::half alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::half beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::half alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::half beta, + sycl::buffer& c, std::int64_t ldc) { blas_major::gemm(queue, transa, transb, m, n, k, 
alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { +void 
symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 
1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, float beta, - sycl::buffer &c, std::int64_t ldc) { +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer& a, std::int64_t lda, float beta, + sycl::buffer& c, std::int64_t ldc) { blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, double beta, - sycl::buffer &c, std::int64_t ldc) { +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer& a, std::int64_t lda, double beta, + sycl::buffer& c, std::int64_t ldc) { blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc) { +void 
syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc) { +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer, 1> &a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer, 1>& a, std::int64_t lda, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer, 1> &a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer, 1>& a, std::int64_t lda, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t 
ldb, float beta, sycl::buffer &c, std::int64_t ldc) { +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + 
sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(sycl::queue &queue, side left_right, uplo 
upper_lower, transpose transa, diag unit_diag, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb) { +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { blas_major::trsm(queue, 
left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb) { +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb); } // USM APIs -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, 
transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, 
std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, sycl::half beta, - sycl::half *c, std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, sycl::half beta, + sycl::half* c, std::int64_t ldc, const std::vector& dependencies) { return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, std::int64_t lda, - const sycl::half *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::int64_t k, float 
alpha, const bfloat16 *a, std::int64_t lda, - const bfloat16 *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, std::int64_t lda, + const bfloat16* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::gemm(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, 
const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::symm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, 
a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::hemm(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, float beta, - float *c, std::int64_t ldc, const std::vector &dependencies) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, float beta, + float* c, std::int64_t ldc, const std::vector& dependencies) { return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, double beta, - double *c, std::int64_t ldc, const std::vector &dependencies) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, double beta, + double* c, std::int64_t ldc, const std::vector& dependencies) { return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - 
std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return blas_major::syrk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const std::complex *a, std::int64_t lda, - float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const std::complex* a, std::int64_t lda, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const std::complex *a, std::int64_t lda, - double beta, std::complex *c, std::int64_t ldc, - const std::vector 
&dependencies) { +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const std::complex* a, std::int64_t lda, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::herk(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event 
syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::syr2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, float beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, float beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t 
lda, const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, double beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return blas_major::her2k(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag 
unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return blas_major::trmm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, - diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, - diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + 
diag unit_diag, std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return blas_major::trsm(queue, left_right, upper_lower, transa, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } diff --git a/src/blas/backends/netlib/netlib_batch.cxx b/src/blas/backends/netlib/netlib_batch.cxx index 7a2839dd4..5af30b80f 100644 --- a/src/blas/backends/netlib/netlib_batch.cxx +++ b/src/blas/backends/netlib/netlib_batch.cxx @@ -19,8 +19,8 @@ // Buffer APIs -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, 
sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); @@ -30,8 +30,8 @@ void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_ #endif } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); @@ -41,8 +41,8 @@ void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64 #endif } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); @@ -52,8 +52,8 @@ void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, #endif } -void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, - int64_t incx, int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, + int64_t incx, int64_t stridex, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); @@ -63,9 +63,9 @@ void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer #endif } -void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, - int64_t stridey, int64_t batch_size) { +void axpy_batch(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + int64_t 
stridex, sycl::buffer& y, int64_t incy, int64_t stridey, + int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -74,8 +74,8 @@ void axpy_batch(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +void axpy_batch(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); @@ -85,9 +85,9 @@ void axpy_batch(sycl::queue &queue, int64_t n, double alpha, sycl::buffer alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); @@ -97,9 +97,9 @@ void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, #endif } -void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, int64_t stridex, - sycl::buffer, 1> &y, int64_t incy, int64_t stridey, +void axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, int64_t stridex, + sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); @@ -109,10 +109,10 @@ void axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, #endif } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, float beta, - sycl::buffer &y, int64_t incy, 
int64_t stride_y, int64_t batch_size) { +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, float beta, sycl::buffer& y, int64_t incy, + int64_t stride_y, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -121,11 +121,10 @@ void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, floa #endif } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, double beta, - sycl::buffer &y, int64_t incy, int64_t stride_y, - int64_t batch_size) { +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, int64_t stride_x, double beta, + sycl::buffer& y, int64_t incy, int64_t stride_y, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -134,12 +133,11 @@ void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, doub #endif } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &x, int64_t incx, - int64_t stride_x, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stride_y, - int64_t batch_size) { +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, + int64_t stride_x, std::complex beta, sycl::buffer, 1>& y, + int64_t incy, int64_t stride_y, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif 
@@ -148,11 +146,11 @@ void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, #endif } -void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &x, - int64_t incx, int64_t stride_x, std::complex beta, - sycl::buffer, 1> &y, int64_t incy, int64_t stride_y, +void gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& x, int64_t incx, + int64_t stride_x, std::complex beta, + sycl::buffer, 1>& y, int64_t incy, int64_t stride_y, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); @@ -162,10 +160,10 @@ void gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, #endif } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, sycl::buffer& x, + int64_t incx, int64_t stride_x, sycl::buffer& c, int64_t ldc, + int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -174,10 +172,10 @@ void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, #endif } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &x, int64_t incx, int64_t stride_x, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& x, int64_t incx, 
int64_t stride_x, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -186,10 +184,10 @@ void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, #endif } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); @@ -199,10 +197,10 @@ void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, #endif } -void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &x, int64_t incx, int64_t stride_x, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& x, int64_t incx, int64_t stride_x, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); @@ -212,11 +210,10 @@ void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { +void 
gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -225,11 +222,10 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, double beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -238,12 +234,11 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, - int64_t ldb, int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, + int64_t stride_b, std::complex beta, sycl::buffer, 1>& c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef 
COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -252,11 +247,11 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); @@ -266,10 +261,10 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, sycl::half alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::half beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); @@ -279,10 +274,10 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, 
int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); @@ -292,10 +287,10 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); @@ -305,10 +300,10 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int64_t stride_a, sycl::buffer &b, int64_t ldb, int64_t stride_b, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, + float beta, sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw 
unimplemented("blas", "gemm_batch", "for column_major layout"); @@ -318,9 +313,9 @@ void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); @@ -330,9 +325,9 @@ void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose #endif } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, int64_t stride_a, sycl::buffer &b, int64_t ldb, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, int64_t stride_a, sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); @@ -342,10 +337,10 @@ void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose #endif } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, 
int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); @@ -355,10 +350,10 @@ void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose #endif } -void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, +void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, int64_t stride_a, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); @@ -368,10 +363,9 @@ void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose #endif } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, float beta, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -380,9 +374,9 @@ void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n #endif } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& 
c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); @@ -392,11 +386,10 @@ void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n #endif } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, - int64_t batch_size) { +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, sycl::buffer, 1>& c, + int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -405,10 +398,10 @@ void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n #endif } -void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, +void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); @@ -418,9 +411,9 @@ void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n #endif } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t 
n, float alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); #endif @@ -429,9 +422,9 @@ void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, f #endif } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, int64_t stride_a, - sycl::buffer &b, int64_t ldb, int64_t stride_b, int64_t batch_size) { +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, int64_t stride_a, + sycl::buffer& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); #endif @@ -440,9 +433,9 @@ void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, d #endif } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t stride_a, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); @@ -452,9 +445,9 @@ void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, int64_t stride_a, sycl::buffer, 1> &b, +void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, int64_t stride_a, sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, 
int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); @@ -464,8 +457,8 @@ void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); @@ -475,8 +468,8 @@ void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, f #endif } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); @@ -486,8 +479,8 @@ void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, d #endif } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); @@ -497,8 +490,8 @@ void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, 
int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); @@ -508,10 +501,10 @@ void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - float beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + float beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); #endif @@ -520,10 +513,10 @@ void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64 #endif } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, int64_t stride_a, - double beta, sycl::buffer &b, int64_t ldb, int64_t stride_b, - sycl::buffer &c, int64_t ldc, int64_t stride_c, int64_t batch_size) { +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, int64_t stride_a, + double beta, sycl::buffer& b, int64_t ldb, int64_t stride_b, + sycl::buffer& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); #endif @@ -532,11 +525,11 @@ void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64 #endif } -void omatadd_batch(sycl::queue &queue, transpose transa, 
transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); @@ -546,11 +539,11 @@ void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64 #endif } -void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, +void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, int64_t ldb, int64_t stride_b, - sycl::buffer, 1> &c, int64_t ldc, int64_t stride_c, + sycl::buffer, 1>& b, int64_t ldb, int64_t stride_b, + sycl::buffer, 1>& c, int64_t ldc, int64_t stride_c, int64_t batch_size) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); @@ -562,9 +555,9 @@ void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64 // USM APIs -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t *incx, - float **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const float** x, int64_t* incx, float** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ 
-573,9 +566,9 @@ sycl::event copy_batch(sycl::queue &queue, int64_t *n, const float **x, int64_t #endif } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t *incx, - double **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const double** x, int64_t* incx, double** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ -584,10 +577,9 @@ sycl::event copy_batch(sycl::queue &queue, int64_t *n, const double **x, int64_t #endif } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, int64_t *incy, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, int64_t* incx, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ -596,10 +588,9 @@ sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex #endif } -sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex **x, - int64_t *incx, std::complex **y, int64_t *incy, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t* n, const std::complex** x, + int64_t* incx, std::complex** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ -608,10 +599,9 @@ sycl::event copy_batch(sycl::queue &queue, int64_t *n, const std::complex &dependencies) { +sycl::event copy_batch(sycl::queue& 
queue, int64_t n, const float* x, int64_t incx, + std::int64_t stridex, float* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ -620,10 +610,9 @@ sycl::event copy_batch(sycl::queue &queue, int64_t n, const float *x, int64_t in #endif } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t incx, - std::int64_t stridex, double *y, int64_t incy, std::int64_t stridey, - std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const double* x, int64_t incx, + std::int64_t stridex, double* y, int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ -632,10 +621,10 @@ sycl::event copy_batch(sycl::queue &queue, int64_t n, const double *x, int64_t i #endif } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, - int64_t incx, std::int64_t stridex, std::complex *y, int64_t incy, - std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ -644,10 +633,10 @@ sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex #endif } -sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex *x, - int64_t incx, std::int64_t stridex, std::complex *y, - int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, int64_t n, const 
std::complex* x, int64_t incx, + std::int64_t stridex, std::complex* y, int64_t incy, + std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "copy_batch", "for column_major layout"); #endif @@ -656,9 +645,9 @@ sycl::event copy_batch(sycl::queue &queue, int64_t n, const std::complex #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float **x, - int64_t *incx, float **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, float* alpha, const float** x, int64_t* incx, + float** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -667,9 +656,9 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t *n, float *alpha, const float #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const double **x, - int64_t *incx, double **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, double* alpha, const double** x, + int64_t* incx, double** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -678,10 +667,10 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t *n, double *alpha, const doub #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + 
int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -690,10 +679,10 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alph #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alpha, - const std::complex **x, int64_t *incx, std::complex **y, - int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t* n, std::complex* alpha, + const std::complex** x, int64_t* incx, std::complex** y, + int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -702,9 +691,9 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t *n, std::complex *alp #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float *x, - int64_t incx, int64_t stridex, float *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + int64_t stridex, float* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -713,9 +702,9 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t n, float alpha, const float * #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double *x, - int64_t incx, int64_t stridex, double *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + int64_t stridex, double* y, int64_t incy, int64_t stridey, + int64_t batch_size, 
const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -724,10 +713,10 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t n, double alpha, const double #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -736,10 +725,10 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, #endif } -sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, int64_t stridex, - std::complex *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, int64_t stridex, + std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpy_batch", "for column_major layout"); #endif @@ -748,11 +737,10 @@ sycl::event axpy_batch(sycl::queue &queue, int64_t n, std::complex alpha #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, int64_t stride_a, - const float *x, int64_t incx, int64_t stride_x, float beta, float *y, - int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, float 
alpha, + const float* a, int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float beta, float* y, int64_t incy, int64_t stride_y, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -761,11 +749,11 @@ sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, int64_t stride_a, - const double *x, int64_t incx, int64_t stride_x, double beta, double *y, - int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, const double* x, + int64_t incx, int64_t stride_x, double beta, double* y, int64_t incy, + int64_t stride_y, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -774,12 +762,12 @@ sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, - int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, + int64_t incy, int64_t stride_y, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw 
unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -788,12 +776,12 @@ sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, const std::complex *x, int64_t incx, - int64_t stride_x, std::complex beta, std::complex *y, - int64_t incy, int64_t stride_y, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose transa, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, const std::complex* x, int64_t incx, + int64_t stride_x, std::complex beta, std::complex* y, + int64_t incy, int64_t stride_y, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -802,11 +790,10 @@ sycl::event gemv_batch(sycl::queue &queue, transpose transa, int64_t m, int64_t #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - float *alpha, const float **a, int64_t *lda, const float **x, - int64_t *incx, float *beta, float **y, int64_t *incy, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, float* alpha, + const float** a, int64_t* lda, const float** x, int64_t* incx, float* beta, + float** y, int64_t* incy, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -815,11 +802,10 @@ sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_ #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - double *alpha, const double **a, int64_t *lda, 
const double **x, - int64_t *incx, double *beta, double **y, int64_t *incy, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, double* alpha, + const double** a, int64_t* lda, const double** x, int64_t* incx, + double* beta, double** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -828,11 +814,11 @@ sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_ #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex *beta, - std::complex **y, int64_t *incy, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -841,12 +827,11 @@ sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_ #endif } -sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, const std::complex **x, int64_t *incx, - std::complex *beta, std::complex **y, int64_t *incy, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, transpose* transa, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, int64_t* lda, + const 
std::complex** x, int64_t* incx, std::complex* beta, + std::complex** y, int64_t* incy, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemv_batch", "for column_major layout"); #endif @@ -855,10 +840,10 @@ sycl::event gemv_batch(sycl::queue &queue, transpose *transa, int64_t *m, int64_ #endif } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const float *a, int64_t lda, int64_t stride_a, const float *x, - int64_t incx, int64_t stride_x, float *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const float* a, + int64_t lda, int64_t stride_a, const float* x, int64_t incx, + int64_t stride_x, float* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -867,10 +852,10 @@ sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n #endif } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const double *a, int64_t lda, int64_t stride_a, const double *x, - int64_t incx, int64_t stride_x, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, const double* a, + int64_t lda, int64_t stride_a, const double* x, int64_t incx, + int64_t stride_x, double* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -879,11 +864,11 @@ sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n #endif } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t 
n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -892,11 +877,11 @@ sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n #endif } -sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *x, int64_t incx, int64_t stride_x, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, + const std::complex* a, int64_t lda, int64_t stride_a, + const std::complex* x, int64_t incx, int64_t stride_x, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -905,10 +890,10 @@ sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n #endif } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const float **a, int64_t *lda, const float **x, int64_t *incx, float **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const float** a, int64_t* lda, const float** x, int64_t* incx, float** c, + int64_t* ldc, int64_t 
group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -917,10 +902,10 @@ sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t #endif } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const double **a, int64_t *lda, const double **x, int64_t *incx, - double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const double** a, int64_t* lda, const double** x, int64_t* incx, double** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -929,11 +914,10 @@ sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t #endif } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -942,11 +926,10 @@ sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t #endif } -sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, - const std::complex **a, int64_t *lda, - const std::complex **x, int64_t *incx, std::complex **c, - int64_t *ldc, int64_t group_count, 
int64_t *groupsize, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, + const std::complex** a, int64_t* lda, const std::complex** x, + int64_t* incx, std::complex** c, int64_t* ldc, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "dgmm_batch", "for column_major layout"); #endif @@ -955,11 +938,11 @@ sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const float **a, int64_t *lda, - const float **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const float** a, int64_t* lda, + const float** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -968,11 +951,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, double *alpha, const double **a, int64_t *lda, - const double **b, int64_t *ldb, double *beta, double **c, int64_t *ldc, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, double* alpha, const double** a, int64_t* lda, + const double** b, int64_t* ldb, double* beta, double** c, int64_t* ldc, + int64_t group_count, int64_t* group_size, + 
const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -981,12 +964,12 @@ sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, - const std::complex **b, int64_t *ldb, std::complex *beta, - std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -995,12 +978,12 @@ sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, std::complex *alpha, - const std::complex **a, int64_t *lda, - const std::complex **b, int64_t *ldb, std::complex *beta, - std::complex **c, int64_t *ldc, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, std::complex* alpha, + const std::complex** a, int64_t* lda, const std::complex** b, + int64_t* ldb, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1009,11 +992,11 @@ 
sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, sycl::half *alpha, const sycl::half **a, - int64_t *lda, const sycl::half **b, int64_t *ldb, sycl::half *beta, - sycl::half **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, sycl::half* alpha, const sycl::half** a, + int64_t* lda, const sycl::half** b, int64_t* ldb, sycl::half* beta, + sycl::half** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1022,11 +1005,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const sycl::half **a, int64_t *lda, - const sycl::half **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const sycl::half** a, int64_t* lda, + const sycl::half** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1035,11 +1018,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const std::int8_t 
**a, int64_t *lda, - const std::int8_t **b, int64_t *ldb, float *beta, float **c, int64_t *ldc, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, float** c, int64_t* ldc, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1048,11 +1031,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, - int64_t *n, int64_t *k, float *alpha, const std::int8_t **a, int64_t *lda, - const std::int8_t **b, int64_t *ldb, float *beta, std::int32_t **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, + int64_t* n, int64_t* k, float* alpha, const std::int8_t** a, int64_t* lda, + const std::int8_t** b, int64_t* ldb, float* beta, std::int32_t** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1061,11 +1044,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, - int64_t stride_a, const float *b, int64_t ldb, int64_t stride_b, - float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, 
transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const float* a, int64_t lda, int64_t stride_a, + const float* b, int64_t ldb, int64_t stride_b, float beta, float* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1074,11 +1057,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, - int64_t stride_a, const double *b, int64_t ldb, int64_t stride_b, - double beta, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, const double* a, int64_t lda, int64_t stride_a, + const double* b, int64_t ldb, int64_t stride_b, double beta, double* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1087,13 +1070,12 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex beta, std::complex *c, int64_t ldc, - int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, 
std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1102,13 +1084,12 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, int64_t stride_a, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex beta, std::complex *c, int64_t ldc, - int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, int64_t stride_a, const std::complex* b, int64_t ldb, + int64_t stride_b, std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1117,11 +1098,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda, - int64_t stride_a, const sycl::half *b, int64_t ldb, int64_t stride_b, - sycl::half beta, sycl::half *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, sycl::half alpha, const sycl::half* a, int64_t lda, + int64_t stride_a, const sycl::half* b, int64_t ldb, int64_t stride_b, + sycl::half beta, sycl::half* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const 
std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1130,11 +1111,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const sycl::half *a, int64_t lda, int64_t stride_a, - const sycl::half *b, int64_t ldb, int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const sycl::half* a, int64_t lda, int64_t stride_a, + const sycl::half* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1143,11 +1124,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a, - const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, float *c, +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1156,11 +1137,11 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event gemm_batch(sycl::queue &queue, transpose 
transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const std::int8_t *a, int64_t lda, int64_t stride_a, - const std::int8_t *b, int64_t ldb, int64_t stride_b, float beta, - std::int32_t *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const std::int8_t* a, int64_t lda, int64_t stride_a, + const std::int8_t* b, int64_t ldb, int64_t stride_b, float beta, + std::int32_t* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_batch", "for column_major layout"); #endif @@ -1169,11 +1150,10 @@ sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, i #endif } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, - int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1182,11 +1162,10 @@ sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, #endif } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, - int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { +sycl::event 
trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1195,11 +1174,11 @@ sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, #endif } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1208,11 +1187,11 @@ sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, #endif } -sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, int64_t stride_a, + std::complex* b, int64_t ldb, int64_t stride_b, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR 
throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1221,11 +1200,10 @@ sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, #endif } -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, float *alpha, - const float **a, int64_t *lda, float **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, float* alpha, const float** a, + int64_t* lda, float** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1234,11 +1212,10 @@ sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, #endif } -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, double *alpha, - const double **a, int64_t *lda, double **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, double* alpha, const double** a, + int64_t* lda, double** b, int64_t* ldb, int64_t group_count, + int64_t* groupsize, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1247,11 +1224,11 @@ sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, #endif } -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, int64_t *lda, - 
std::complex **b, int64_t *ldb, int64_t group_count, - int64_t *groupsize, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1260,12 +1237,11 @@ sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, #endif } -sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, transpose* trans, + diag* unit_diag, int64_t* m, int64_t* n, std::complex* alpha, + const std::complex** a, int64_t* lda, std::complex** b, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "trsm_batch", "for column_major layout"); #endif @@ -1274,10 +1250,10 @@ sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, float *alpha, const float **a, int64_t *lda, float *beta, - float **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, float* alpha, const float** a, int64_t* lda, float* beta, + float** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const 
std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -1286,10 +1262,10 @@ sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, double *alpha, const double **a, int64_t *lda, double *beta, - double **c, int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, double* alpha, const double** a, int64_t* lda, double* beta, + double** c, int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -1298,11 +1274,11 @@ sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -1311,11 +1287,11 @@ sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, - int64_t *k, std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex *beta, 
std::complex **c, - int64_t *ldc, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, + int64_t* k, std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex* beta, std::complex** c, + int64_t* ldc, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -1324,10 +1300,10 @@ sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, float *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, int64_t stride_a, float beta, + float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -1336,10 +1312,10 @@ sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, in #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, double *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, int64_t stride_a, double beta, + double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", 
"syrk_batch", "for column_major layout"); #endif @@ -1348,11 +1324,11 @@ sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, in #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex beta, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -1361,11 +1337,11 @@ sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, in #endif } -sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, int64_t stride_a, std::complex beta, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex beta, std::complex* c, + int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "syrk_batch", "for column_major layout"); #endif @@ -1374,10 +1350,10 @@ sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, in #endif } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, int64_t stride_a, float *b, int64_t ldb, +sycl::event 
omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, int64_t stride_a, float* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); #endif @@ -1386,10 +1362,10 @@ sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, int64_t stride_a, double *b, int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, int64_t stride_a, double* b, int64_t ldb, int64_t stride_b, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); #endif @@ -1398,10 +1374,10 @@ sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); #endif @@ -1410,10 +1386,10 @@ sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, 
int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stride_a, std::complex *b, int64_t ldb, int64_t stride_b, - int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stride_a, std::complex* b, int64_t ldb, int64_t stride_b, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy_batch", "for column_major layout"); #endif @@ -1422,9 +1398,9 @@ sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); #endif @@ -1433,9 +1409,9 @@ sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); #endif @@ -1444,10 +1420,10 @@ sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event imatcopy_batch(sycl::queue 
&queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); #endif @@ -1456,10 +1432,10 @@ sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); #endif @@ -1468,11 +1444,11 @@ sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64 #endif } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, float alpha, const float *a, int64_t lda, int64_t stride_a, - float beta, const float *b, int64_t ldb, int64_t stride_b, float *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, float alpha, const float* a, int64_t lda, int64_t stride_a, + float beta, const float* b, int64_t ldb, int64_t stride_b, float* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); #endif @@ -1481,11 +1457,11 @@ sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose 
transb #endif } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, double alpha, const double *a, int64_t lda, int64_t stride_a, - double beta, const double *b, int64_t ldb, int64_t stride_b, double *c, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, double alpha, const double* a, int64_t lda, int64_t stride_a, + double beta, const double* b, int64_t ldb, int64_t stride_b, double* c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); #endif @@ -1494,12 +1470,12 @@ sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb #endif } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, int64_t batch_size, - const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, int64_t batch_size, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); #endif @@ -1508,12 +1484,12 @@ sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb #endif } -sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, std::complex alpha, const std::complex *a, +sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, 
std::complex alpha, const std::complex* a, int64_t lda, int64_t stride_a, std::complex beta, - const std::complex *b, int64_t ldb, int64_t stride_b, - std::complex *c, int64_t ldc, int64_t stride_c, - int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, int64_t ldb, int64_t stride_b, + std::complex* c, int64_t ldc, int64_t stride_c, + int64_t batch_size, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd_batch", "for column_major layout"); #endif diff --git a/src/blas/backends/netlib/netlib_common.hpp b/src/blas/backends/netlib/netlib_common.hpp index 3a69c70f8..18c08221d 100644 --- a/src/blas/backends/netlib/netlib_common.hpp +++ b/src/blas/backends/netlib/netlib_common.hpp @@ -79,19 +79,19 @@ inline CBLAS_OFFSET convert_to_cblas_offset(offset offsetc) { // host_task automatically uses run_on_host_intel if it is supported by the // compiler. Otherwise, it falls back to single_task. template -static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.host_task(f)) { +static inline auto host_task_internal(H& cgh, F f, int) -> decltype(cgh.host_task(f)) { return cgh.host_task(f); } template -static inline void host_task_internal(H &cgh, F f, long) { +static inline void host_task_internal(H& cgh, F f, long) { #ifndef __SYCL_DEVICE_ONLY__ cgh.template single_task(f); #endif } template -static inline void host_task(H &cgh, F f) { +static inline void host_task(H& cgh, F f) { (void)host_task_internal(cgh, f, 0); } diff --git a/src/blas/backends/netlib/netlib_extensions.cxx b/src/blas/backends/netlib/netlib_extensions.cxx index 8e94cb880..d0c13ebbd 100644 --- a/src/blas/backends/netlib/netlib_extensions.cxx +++ b/src/blas/backends/netlib/netlib_extensions.cxx @@ -19,11 +19,10 @@ // Buffer APIs -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, int8_t ao, sycl::buffer &b, int64_t 
ldb, int8_t bo, - float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for column_major layout"); #endif @@ -32,11 +31,10 @@ void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset of #endif } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, int8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, - float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for column_major layout"); #endif @@ -45,11 +43,10 @@ void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset of #endif } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, uint8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, - float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for 
column_major layout"); #endif @@ -58,11 +55,10 @@ void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset of #endif } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, sycl::buffer &a, - int64_t lda, uint8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, - float beta, sycl::buffer &c, int64_t ldc, - sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for column_major layout"); #endif @@ -71,9 +67,9 @@ void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset of #endif } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); @@ -83,9 +79,9 @@ void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose tra #endif } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) 
{ #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); @@ -95,10 +91,10 @@ void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose tra #endif } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); #endif @@ -107,10 +103,10 @@ void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose tra #endif } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); #endif @@ -119,8 +115,8 @@ void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose tra #endif } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for 
column_major layout"); #endif @@ -129,8 +125,8 @@ void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float a #endif } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for column_major layout"); #endif @@ -139,9 +135,9 @@ void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double #endif } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for column_major layout"); #endif @@ -150,9 +146,9 @@ void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::co #endif } -void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for column_major layout"); #endif @@ -161,9 +157,9 @@ void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::co #endif } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t 
n, float alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy2", "for column_major layout"); #endif @@ -172,9 +168,9 @@ void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float #endif } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy2", "for column_major layout"); #endif @@ -183,9 +179,9 @@ void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double #endif } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy2", "for column_major layout"); #endif @@ -194,9 +190,9 @@ void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::c #endif } -void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, int64_t ldb, +void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, int64_t ldb, std::int64_t strideb) { #ifdef COLUMN_MAJOR throw 
unimplemented("blas", "omatcopy2", "for column_major layout"); @@ -206,8 +202,8 @@ void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); #endif @@ -216,8 +212,8 @@ void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float a #endif } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); #endif @@ -226,8 +222,8 @@ void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double #endif } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); #endif @@ -236,8 +232,8 @@ void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::co #endif } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); 
#endif @@ -246,9 +242,9 @@ void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::co #endif } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, sycl::buffer &a, int64_t lda, float beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, sycl::buffer& a, int64_t lda, float beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -257,9 +253,9 @@ void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, #endif } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, sycl::buffer &a, int64_t lda, double beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, sycl::buffer& a, int64_t lda, double beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -268,10 +264,10 @@ void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, #endif } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -280,10 +276,10 @@ void 
omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, #endif } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &b, int64_t ldb, - sycl::buffer, 1> &c, int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& b, int64_t ldb, + sycl::buffer, 1>& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -294,11 +290,11 @@ void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, // USM APIs -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - const int8_t *a, int64_t lda, int8_t ao, const int8_t *b, int64_t ldb, - int8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for column_major layout"); #endif @@ -307,11 +303,11 @@ sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, #endif } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - const int8_t *a, int64_t lda, int8_t ao, const uint8_t *b, int64_t ldb, - uint8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose 
transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for column_major layout"); #endif @@ -320,11 +316,11 @@ sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, #endif } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - const uint8_t *a, int64_t lda, uint8_t ao, const int8_t *b, int64_t ldb, - int8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for column_major layout"); #endif @@ -333,11 +329,11 @@ sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, #endif } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, - offset offsetc, int64_t m, int64_t n, int64_t k, float alpha, - const uint8_t *a, int64_t lda, uint8_t ao, const uint8_t *b, int64_t ldb, - uint8_t bo, float beta, int32_t *c, int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { #ifdef 
COLUMN_MAJOR throw unimplemented("blas", "gemm_bias", "for column_major layout"); #endif @@ -346,10 +342,10 @@ sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, #endif } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, - const float *b, int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); #endif @@ -358,10 +354,10 @@ sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transp #endif } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, - const double *b, int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); #endif @@ -370,11 +366,11 @@ sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transp #endif } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& 
queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); #endif @@ -383,11 +379,11 @@ sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transp #endif } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *b, - int64_t ldb, std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemmt", "for column_major layout"); #endif @@ -396,9 +392,9 @@ sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transp #endif } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, float *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, float* b, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for column_major layout"); #endif @@ -407,9 +403,9 @@ sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, double *b, int64_t ldb, - 
const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, double* b, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for column_major layout"); #endif @@ -418,10 +414,10 @@ sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for column_major layout"); #endif @@ -430,10 +426,10 @@ sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy", "for column_major layout"); #endif @@ -442,9 +438,9 @@ sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, std::int64_t stridea, float *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + 
const float* a, int64_t lda, std::int64_t stridea, float* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy2", "for column_major layout"); #endif @@ -453,9 +449,9 @@ sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, std::int64_t stridea, double *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, std::int64_t stridea, double* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy2", "for column_major layout"); #endif @@ -464,10 +460,10 @@ sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy2", "for column_major layout"); #endif @@ -476,10 +472,10 @@ sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::int64_t stridea, std::complex *b, int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event 
omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::int64_t stridea, std::complex* b, int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatcopy2", "for column_major layout"); #endif @@ -488,9 +484,9 @@ sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); #endif @@ -499,9 +495,9 @@ sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); #endif @@ -510,9 +506,9 @@ sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); 
#endif @@ -521,9 +517,9 @@ sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "imatcopy", "for column_major layout"); #endif @@ -532,10 +528,10 @@ sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, #endif } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, float beta, const float *b, - int64_t ldb, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, float beta, const float* b, + int64_t ldb, float* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -544,10 +540,10 @@ sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int6 #endif } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, double beta, const double *b, - int64_t ldb, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, double beta, const double* b, + int64_t ldb, double* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -556,11 
+552,11 @@ sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int6 #endif } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -569,11 +565,11 @@ sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int6 #endif } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, const std::complex *b, int64_t ldb, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, const std::complex* b, int64_t ldb, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "omatadd", "for column_major layout"); #endif @@ -581,5 +577,3 @@ sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int6 throw unimplemented("blas", "omatadd", "for row_major layout"); #endif } - - diff --git a/src/blas/backends/netlib/netlib_level1.cpp b/src/blas/backends/netlib/netlib_level1.cpp index 59830db81..284adce75 100644 --- a/src/blas/backends/netlib/netlib_level1.cpp +++ b/src/blas/backends/netlib/netlib_level1.cpp @@ -43,7 +43,7 @@ inline double abs_val(std::complex 
val) { return std::abs(val.real()) + std::abs(val.imag()); } -int cblas_isamin(int n, const float *x, int incx) { +int cblas_isamin(int n, const float* x, int incx) { if (n < 1 || incx < 1) { return 0; } @@ -65,7 +65,7 @@ int cblas_isamin(int n, const float *x, int incx) { return min_idx; } -int cblas_idamin(int n, const double *x, int incx) { +int cblas_idamin(int n, const double* x, int incx) { if (n < 1 || incx < 1) { return 0; } @@ -87,7 +87,7 @@ int cblas_idamin(int n, const double *x, int incx) { return min_idx; } -int cblas_icamin(int n, const std::complex *x, int incx) { +int cblas_icamin(int n, const std::complex* x, int incx) { if (n < 1 || incx < 1) { return 0; } @@ -109,7 +109,7 @@ int cblas_icamin(int n, const std::complex *x, int incx) { return min_idx; } -int cblas_izamin(int n, const std::complex *x, int incx) { +int cblas_izamin(int n, const std::complex* x, int incx) { if (n < 1 || incx < 1) { return 0; } @@ -131,7 +131,7 @@ int cblas_izamin(int n, const std::complex *x, int incx) { return min_idx; } -void cblas_csrot(const int n, std::complex *cx, const int incx, std::complex *cy, +void cblas_csrot(const int n, std::complex* cx, const int incx, std::complex* cy, const int incy, const float c, const float s) { if (n < 1) return; @@ -158,7 +158,7 @@ void cblas_csrot(const int n, std::complex *cx, const int incx, std::comp } } -void cblas_zdrot(const int n, std::complex *zx, const int incx, std::complex *zy, +void cblas_zdrot(const int n, std::complex* zx, const int incx, std::complex* zy, const int incy, const double c, const double s) { if (n < 1) return; @@ -185,8 +185,8 @@ void cblas_zdrot(const int n, std::complex *zx, const int incx, std::com } } -void cblas_crotg(std::complex *ca, const std::complex *cb, float *c, - std::complex *s) { +void cblas_crotg(std::complex* ca, const std::complex* cb, float* c, + std::complex* s) { if (std::abs(ca[0]) == 0) { c[0] = 0.0; s[0] = std::complex(1.0, 0.0); @@ -203,8 +203,8 @@ void cblas_crotg(std::complex 
*ca, const std::complex *cb, float * } } -void cblas_zrotg(std::complex *ca, const std::complex *cb, double *c, - std::complex *s) { +void cblas_zrotg(std::complex* ca, const std::complex* cb, double* c, + std::complex* s) { if (std::abs(ca[0]) == 0) { c[0] = 0.0; s[0] = std::complex(1.0, 0.0); diff --git a/src/blas/backends/netlib/netlib_level1.cxx b/src/blas/backends/netlib/netlib_level1.cxx index 9f953dc5b..5514a86c1 100644 --- a/src/blas/backends/netlib/netlib_level1.cxx +++ b/src/blas/backends/netlib/netlib_level1.cxx @@ -19,9 +19,9 @@ // Buffer APIs -void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -31,9 +31,9 @@ void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx }); } -void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -43,9 +43,9 @@ void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t inc }); } -void asum(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void asum(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -55,9 +55,9 @@ void asum(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x 
}); } -void asum(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void asum(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -67,9 +67,9 @@ void asum(sycl::queue &queue, int64_t n, sycl::buffer, 1> & }); } -void axpy(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void axpy(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -79,9 +79,9 @@ void axpy(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, }); } -void axpy(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void axpy(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -91,34 +91,34 @@ void axpy(sycl::queue &queue, int64_t n, double alpha, sycl::buffer & }); } -void axpy(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void axpy(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { - ::cblas_caxpy((const 
int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR, + ::cblas_caxpy((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void axpy(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void axpy(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { - ::cblas_zaxpy((const int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR, + ::cblas_zaxpy((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ -127,8 +127,8 @@ void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x #endif } -void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ -137,9 +137,9 @@ void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer #endif } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, 
int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ -148,9 +148,9 @@ void axpby(sycl::queue &queue, int64_t n, std::complex alpha, #endif } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ -159,9 +159,9 @@ void axpby(sycl::queue &queue, int64_t n, std::complex alpha, #endif } -void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -171,9 +171,9 @@ void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx }); } -void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -183,9 +183,9 @@ void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t inc }); } -void copy(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void copy(sycl::queue& queue, int64_t n, 
sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -195,9 +195,9 @@ void copy(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x }); } -void copy(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void copy(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -207,9 +207,9 @@ void copy(sycl::queue &queue, int64_t n, sycl::buffer, 1> & }); } -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -221,9 +221,9 @@ void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, }); } -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -235,9 +235,9 @@ void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx }); } -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer 
&y, int64_t incy, sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -249,10 +249,10 @@ void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, }); } -void dotc(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { - queue.submit([&](sycl::handler &cgh) { +void dotc(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -264,10 +264,10 @@ void dotc(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x }); } -void dotc(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { - queue.submit([&](sycl::handler &cgh) { +void dotc(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -279,10 +279,10 @@ void dotc(sycl::queue &queue, int64_t n, sycl::buffer, 1> & }); } -void dotu(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { - queue.submit([&](sycl::handler &cgh) { +void dotu(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) 
{ auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -294,10 +294,10 @@ void dotu(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x }); } -void dotu(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &result) { - queue.submit([&](sycl::handler &cgh) { +void dotu(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -309,9 +309,9 @@ void dotu(sycl::queue &queue, int64_t n, sycl::buffer, 1> & }); } -void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -320,9 +320,9 @@ void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t inc }); } -void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.template get_access(cgh); auto accessor_result = result.template get_access(cgh); host_task(cgh, [=]() { @@ -331,9 +331,9 @@ void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t in }); } -void iamin(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamin(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t 
incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -342,9 +342,9 @@ void iamin(sycl::queue &queue, int64_t n, sycl::buffer, 1> & }); } -void iamin(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamin(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -353,9 +353,9 @@ void iamin(sycl::queue &queue, int64_t n, sycl::buffer, 1> }); } -void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -364,9 +364,9 @@ void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t inc }); } -void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -375,9 +375,9 @@ void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t in }); } -void iamax(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamax(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + 
queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -386,9 +386,9 @@ void iamax(sycl::queue &queue, int64_t n, sycl::buffer, 1> & }); } -void iamax(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void iamax(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -397,9 +397,9 @@ void iamax(sycl::queue &queue, int64_t n, sycl::buffer, 1> }); } -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.template get_access(cgh); auto accessor_result = result.template get_access(cgh); host_task(cgh, [=]() { @@ -409,9 +409,9 @@ void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx }); } -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -421,9 +421,9 @@ void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t inc }); } -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto 
accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -433,9 +433,9 @@ void nrm2(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x }); } -void nrm2(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void nrm2(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer& result) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_result = result.get_access(cgh); host_task(cgh, [=]() { @@ -445,9 +445,9 @@ void nrm2(sycl::queue &queue, int64_t n, sycl::buffer, 1> & }); } -void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, float c, float s) { - queue.submit([&](sycl::handler &cgh) { +void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, float c, float s) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -457,9 +457,9 @@ void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, }); } -void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, double c, double s) { - queue.submit([&](sycl::handler &cgh) { +void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, double c, double s) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -469,9 +469,9 @@ void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx }); } -void rot(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, float c, float s) { - queue.submit([&](sycl::handler &cgh) { +void rot(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, 
int64_t incy, float c, float s) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -482,9 +482,9 @@ void rot(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, }); } -void rot(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, double c, double s) { - queue.submit([&](sycl::handler &cgh) { +void rot(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, double c, double s) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -495,9 +495,9 @@ void rot(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x }); } -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s) { - queue.submit([&](sycl::handler &cgh) { +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -509,9 +509,9 @@ void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer }); } -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s) { - queue.submit([&](sycl::handler &cgh) { +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -523,10 +523,10 @@ void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { - queue.submit([&](sycl::handler &cgh) { +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { + 
queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -538,10 +538,10 @@ void rotg(sycl::queue &queue, sycl::buffer, 1> &a, }); } -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { - queue.submit([&](sycl::handler &cgh) { +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -553,9 +553,9 @@ void rotg(sycl::queue &queue, sycl::buffer, 1> &a, }); } -void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { - queue.submit([&](sycl::handler &cgh) { +void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_param = param.get_access(cgh); @@ -566,9 +566,9 @@ void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx }); } -void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { - queue.submit([&](sycl::handler &cgh) { +void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_param = param.get_access(cgh); @@ -579,9 +579,9 @@ void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t inc }); } -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, float y1, sycl::buffer ¶m) { - queue.submit([&](sycl::handler &cgh) { +void 
rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, float y1, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { auto accessor_d1 = d1.get_access(cgh); auto accessor_d2 = d2.get_access(cgh); auto accessor_x1 = x1.get_access(cgh); @@ -593,9 +593,9 @@ void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, double y1, sycl::buffer ¶m) { - queue.submit([&](sycl::handler &cgh) { +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, double y1, sycl::buffer& param) { + queue.submit([&](sycl::handler& cgh) { auto accessor_d1 = d1.get_access(cgh); auto accessor_d2 = d2.get_access(cgh); auto accessor_x1 = x1.get_access(cgh); @@ -607,8 +607,8 @@ void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void scal(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { ::cblas_sscal((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR, @@ -617,8 +617,8 @@ void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, }); } -void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void scal(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { ::cblas_dscal((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR, @@ -627,20 +627,20 @@ void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer & }); } -void scal(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void scal(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx) { + 
queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { - ::cblas_cscal((const int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR, + ::cblas_cscal((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); }); }); } -void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer, 1> &x, +void scal(sycl::queue& queue, int64_t n, float alpha, sycl::buffer, 1>& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { ::cblas_csscal((const int)n, (const float)alpha, accessor_x.GET_MULTI_PTR, @@ -649,20 +649,20 @@ void scal(sycl::queue &queue, int64_t n, float alpha, sycl::buffer alpha, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void scal(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { - ::cblas_zscal((const int)n, (const void *)&alpha, accessor_x.GET_MULTI_PTR, + ::cblas_zscal((const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)std::abs(incx)); }); }); } -void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer, 1> &x, +void scal(sycl::queue& queue, int64_t n, double alpha, sycl::buffer, 1>& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { ::cblas_zdscal((const int)n, (const double)alpha, accessor_x.GET_MULTI_PTR, @@ -671,9 +671,9 @@ void scal(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { - queue.submit([&](sycl::handler &cgh) { +void sdsdot(sycl::queue& queue, int64_t n, float sb, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { + 
queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_result = result.get_access(cgh); @@ -685,9 +685,9 @@ void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, }); } -void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -697,9 +697,9 @@ void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx }); } -void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -709,9 +709,9 @@ void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t inc }); } -void swap(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void swap(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -721,9 +721,9 @@ void swap(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x }); } -void swap(sycl::queue &queue, int64_t n, sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void swap(sycl::queue& queue, int64_t n, sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, 
int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { @@ -735,9 +735,9 @@ void swap(sycl::queue &queue, int64_t n, sycl::buffer, 1> & // USM APIs -sycl::event asum(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event asum(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -748,9 +748,9 @@ sycl::event asum(sycl::queue &queue, int64_t n, const float *x, int64_t incx, fl return done; } -sycl::event asum(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event asum(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -761,9 +761,9 @@ sycl::event asum(sycl::queue &queue, int64_t n, const double *x, int64_t incx, d return done; } -sycl::event asum(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - float *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event asum(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + float* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -774,9 +774,9 @@ sycl::event 
asum(sycl::queue &queue, int64_t n, const std::complex *x, in return done; } -sycl::event asum(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - double *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event asum(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + double* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -787,9 +787,9 @@ sycl::event asum(sycl::queue &queue, int64_t n, const std::complex *x, i return done; } -sycl::event axpy(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, float *y, - int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event axpy(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, float* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -801,9 +801,9 @@ sycl::event axpy(sycl::queue &queue, int64_t n, float alpha, const float *x, int return done; } -sycl::event axpy(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - double *y, int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event axpy(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -816,41 +816,41 @@ sycl::event axpy(sycl::queue &queue, int64_t n, double alpha, const double *x, i return done; } -sycl::event 
axpy(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event axpy(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_caxpy((const int)n, (const void *)&alpha, x, (const int)incx, y, + ::cblas_caxpy((const int)n, (const void*)&alpha, x, (const int)incx, y, (const int)incy); }); }); return done; } -sycl::event axpy(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event axpy(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_zaxpy((const int)n, (const void *)&alpha, x, (const int)incx, y, + ::cblas_zaxpy((const int)n, (const void*)&alpha, x, (const int)incx, y, (const int)incy); }); }); return done; } -sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - float beta, float *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + float beta, float* y, int64_t incy, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ 
-859,9 +859,9 @@ sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, in #endif } -sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ -870,10 +870,10 @@ sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, #endif } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ -882,10 +882,10 @@ sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, #endif } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "axpby", "for column_major layout"); #endif @@ -894,9 +894,9 @@ sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, #endif } -sycl::event copy(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *y, - int64_t incy, const std::vector &dependencies) { - auto done = 
queue.submit([&](sycl::handler &cgh) { +sycl::event copy(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -907,9 +907,9 @@ sycl::event copy(sycl::queue &queue, int64_t n, const float *x, int64_t incx, fl return done; } -sycl::event copy(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *y, - int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event copy(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -920,10 +920,10 @@ sycl::event copy(sycl::queue &queue, int64_t n, const double *x, int64_t incx, d return done; } -sycl::event copy(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event copy(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -934,10 +934,10 @@ sycl::event copy(sycl::queue &queue, int64_t n, const std::complex *x, in return done; } -sycl::event copy(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event copy(sycl::queue& queue, int64_t n, 
const std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -948,9 +948,9 @@ sycl::event copy(sycl::queue &queue, int64_t n, const std::complex *x, i return done; } -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, float *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, float* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -962,9 +962,9 @@ sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, con return done; } -sycl::event dot(sycl::queue &queue, int64_t n, const double *x, int64_t incx, const double *y, - int64_t incy, double *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event dot(sycl::queue& queue, int64_t n, const double* x, int64_t incx, const double* y, + int64_t incy, double* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -976,9 +976,9 @@ sycl::event dot(sycl::queue &queue, int64_t n, const double *x, int64_t incx, co return done; } -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, double *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t 
incx, const float* y, + int64_t incy, double* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -990,10 +990,10 @@ sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, con return done; } -sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event dotc(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1005,10 +1005,10 @@ sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex *x, in return done; } -sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event dotc(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1020,10 +1020,10 @@ sycl::event dotc(sycl::queue &queue, int64_t n, const std::complex *x, i return done; } -sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const std::vector &dependencies) { - auto done = 
queue.submit([&](sycl::handler &cgh) { +sycl::event dotu(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1035,10 +1035,10 @@ sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex *x, in return done; } -sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - const std::complex *y, int64_t incy, std::complex *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event dotu(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + const std::complex* y, int64_t incy, std::complex* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1050,9 +1050,9 @@ sycl::event dotu(sycl::queue &queue, int64_t n, const std::complex *x, i return done; } -sycl::event iamin(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event iamin(sycl::queue& queue, int64_t n, const float* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1063,9 +1063,9 @@ sycl::event iamin(sycl::queue &queue, int64_t n, const float *x, int64_t incx, i return done; } -sycl::event iamin(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler 
&cgh) { +sycl::event iamin(sycl::queue& queue, int64_t n, const double* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1076,9 +1076,9 @@ sycl::event iamin(sycl::queue &queue, int64_t n, const double *x, int64_t incx, return done; } -sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event iamin(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1089,9 +1089,9 @@ sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex *x, i return done; } -sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event iamin(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1102,9 +1102,9 @@ sycl::event iamin(sycl::queue &queue, int64_t n, const std::complex *x, return done; } -sycl::event iamax(sycl::queue &queue, int64_t n, const float *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event iamax(sycl::queue& queue, int64_t n, const float* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { + auto done = 
queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1115,9 +1115,9 @@ sycl::event iamax(sycl::queue &queue, int64_t n, const float *x, int64_t incx, i return done; } -sycl::event iamax(sycl::queue &queue, int64_t n, const double *x, int64_t incx, int64_t *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event iamax(sycl::queue& queue, int64_t n, const double* x, int64_t incx, int64_t* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1128,9 +1128,9 @@ sycl::event iamax(sycl::queue &queue, int64_t n, const double *x, int64_t incx, return done; } -sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event iamax(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1141,9 +1141,9 @@ sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex *x, i return done; } -sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - int64_t *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event iamax(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + int64_t* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); @@ -1154,9 +1154,9 @@ sycl::event iamax(sycl::queue &queue, int64_t n, const std::complex *x, return done; } -sycl::event nrm2(sycl::queue &queue, int64_t n, const float *x, int64_t incx, float *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event nrm2(sycl::queue& queue, int64_t n, const float* x, int64_t incx, float* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1167,9 +1167,9 @@ sycl::event nrm2(sycl::queue &queue, int64_t n, const float *x, int64_t incx, fl return done; } -sycl::event nrm2(sycl::queue &queue, int64_t n, const double *x, int64_t incx, double *result, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event nrm2(sycl::queue& queue, int64_t n, const double* x, int64_t incx, double* result, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1180,9 +1180,9 @@ sycl::event nrm2(sycl::queue &queue, int64_t n, const double *x, int64_t incx, d return done; } -sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex *x, int64_t incx, - float *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event nrm2(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + float* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1193,9 +1193,9 @@ sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex *x, in return done; } -sycl::event nrm2(sycl::queue 
&queue, int64_t n, const std::complex *x, int64_t incx, - double *result, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event nrm2(sycl::queue& queue, int64_t n, const std::complex* x, int64_t incx, + double* result, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1206,9 +1206,9 @@ sycl::event nrm2(sycl::queue &queue, int64_t n, const std::complex *x, i return done; } -sycl::event rot(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy, - float c, float s, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rot(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + float c, float s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1221,9 +1221,9 @@ sycl::event rot(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, return done; } -sycl::event rot(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy, - double c, double s, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rot(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + double c, double s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1236,10 +1236,10 @@ sycl::event rot(sycl::queue &queue, int64_t n, double *x, int64_t incx, double * return done; } -sycl::event rot(sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, - std::complex *y, int64_t incy, float 
c, float s, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rot(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, float c, float s, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1252,10 +1252,10 @@ sycl::event rot(sycl::queue &queue, int64_t n, std::complex *x, int64_t i return done; } -sycl::event rot(sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, - std::complex *y, int64_t incy, double c, double s, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rot(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, double c, double s, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1268,9 +1268,9 @@ sycl::event rot(sycl::queue &queue, int64_t n, std::complex *x, int64_t return done; } -sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotg(sycl::queue& queue, float* a, float* b, float* c, float* s, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1280,9 +1280,9 @@ sycl::event rotg(sycl::queue &queue, float *a, float *b, float *c, float *s, return done; } -sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotg(sycl::queue& 
queue, double* a, double* b, double* c, double* s, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1292,9 +1292,9 @@ sycl::event rotg(sycl::queue &queue, double *a, double *b, double *c, double *s, return done; } -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, float *c, - std::complex *s, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, float* c, + std::complex* s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1304,9 +1304,9 @@ sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex return done; } -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, double *c, - std::complex *s, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, double* c, + std::complex* s, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1316,9 +1316,9 @@ sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotm(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + float* param, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1330,9 +1330,9 @@ sycl::event 
rotm(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y return done; } -sycl::event rotm(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy, - double *param, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotm(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + double* param, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1344,9 +1344,9 @@ sycl::event rotm(sycl::queue &queue, int64_t n, double *x, int64_t incx, double return done; } -sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, float *param, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotmg(sycl::queue& queue, float* d1, float* d2, float* x1, float y1, float* param, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1357,9 +1357,9 @@ sycl::event rotmg(sycl::queue &queue, float *d1, float *d2, float *x1, float y1, return done; } -sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double y1, double *param, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event rotmg(sycl::queue& queue, double* d1, double* d2, double* x1, double y1, double* param, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1370,9 +1370,9 @@ sycl::event rotmg(sycl::queue &queue, double *d1, double *d2, double *x1, double return done; } -sycl::event scal(sycl::queue &queue, int64_t 
n, float alpha, float *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event scal(sycl::queue& queue, int64_t n, float alpha, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1384,9 +1384,9 @@ sycl::event scal(sycl::queue &queue, int64_t n, float alpha, float *x, int64_t i return done; } -sycl::event scal(sycl::queue &queue, int64_t n, double alpha, double *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event scal(sycl::queue& queue, int64_t n, double alpha, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1398,23 +1398,23 @@ sycl::event scal(sycl::queue &queue, int64_t n, double alpha, double *x, int64_t return done; } -sycl::event scal(sycl::queue &queue, int64_t n, std::complex alpha, std::complex *x, - int64_t incx, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event scal(sycl::queue& queue, int64_t n, std::complex alpha, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_cscal((const int)n, (const void *)&alpha, x, (const int)std::abs(incx)); + ::cblas_cscal((const int)n, (const void*)&alpha, x, (const int)std::abs(incx)); }); }); return done; } -sycl::event scal(sycl::queue &queue, int64_t n, float alpha, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = 
queue.submit([&](sycl::handler &cgh) { +sycl::event scal(sycl::queue& queue, int64_t n, float alpha, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1426,23 +1426,23 @@ sycl::event scal(sycl::queue &queue, int64_t n, float alpha, std::complex return done; } -sycl::event scal(sycl::queue &queue, int64_t n, std::complex alpha, std::complex *x, - int64_t incx, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event scal(sycl::queue& queue, int64_t n, std::complex alpha, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_zscal((const int)n, (const void *)&alpha, x, (const int)std::abs(incx)); + ::cblas_zscal((const int)n, (const void*)&alpha, x, (const int)std::abs(incx)); }); }); return done; } -sycl::event scal(sycl::queue &queue, int64_t n, double alpha, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event scal(sycl::queue& queue, int64_t n, double alpha, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1454,10 +1454,10 @@ sycl::event scal(sycl::queue &queue, int64_t n, double alpha, std::complex &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event sdsdot(sycl::queue& queue, int64_t n, float sb, const float* x, int64_t incx, + const float* y, int64_t incy, float* result, + const std::vector& dependencies) { + 
auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1470,9 +1470,9 @@ sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int6 return done; } -sycl::event swap(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event swap(sycl::queue& queue, int64_t n, float* x, int64_t incx, float* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1483,9 +1483,9 @@ sycl::event swap(sycl::queue &queue, int64_t n, float *x, int64_t incx, float *y return done; } -sycl::event swap(sycl::queue &queue, int64_t n, double *x, int64_t incx, double *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event swap(sycl::queue& queue, int64_t n, double* x, int64_t incx, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1496,10 +1496,10 @@ sycl::event swap(sycl::queue &queue, int64_t n, double *x, int64_t incx, double return done; } -sycl::event swap(sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event swap(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; 
i++) { cgh.depends_on(dependencies[i]); @@ -1510,10 +1510,10 @@ sycl::event swap(sycl::queue &queue, int64_t n, std::complex *x, int64_t return done; } -sycl::event swap(sycl::queue &queue, int64_t n, std::complex *x, int64_t incx, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event swap(sycl::queue& queue, int64_t n, std::complex* x, int64_t incx, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); diff --git a/src/blas/backends/netlib/netlib_level2.cxx b/src/blas/backends/netlib/netlib_level2.cxx index 156ed133b..8e8d74446 100644 --- a/src/blas/backends/netlib/netlib_level2.cxx +++ b/src/blas/backends/netlib/netlib_level2.cxx @@ -19,10 +19,10 @@ // Buffer APIs -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, float beta, sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, float beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -36,10 +36,10 @@ void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, }); } -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, double beta, sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gbmv(sycl::queue& queue, transpose trans, int64_t m, 
int64_t n, int64_t kl, int64_t ku, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, double beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -53,46 +53,46 @@ void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, }); } -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_cgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const int)kl, (const int)ku, (const void *)&alpha, + (const int)kl, (const int)ku, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, - (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, 
int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_zgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const int)kl, (const int)ku, (const void *)&alpha, + (const int)kl, (const int)ku, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_x.GET_MULTI_PTR, - (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -105,10 +105,10 @@ void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha }); } -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -121,44 +121,44 @@ void gemv(sycl::queue &queue, transpose 
trans, int64_t m, int64_t n, double alph }); } -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_cgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_zgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta, + (const void*)&alpha, 
accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void ger(sycl::queue& queue, int64_t m, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); @@ -170,10 +170,10 @@ void ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void ger(sycl::queue& queue, int64_t m, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); @@ -185,142 +185,142 @@ void ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, sycl::buffer alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { - ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha, + ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, 
accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); }); }); } -void gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { - ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha, + ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); }); }); } -void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { - ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void *)&alpha, + ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); }); }); } -void geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 
1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { - ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void *)&alpha, + ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); }); }); } -void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_chbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - 
queue.submit([&](sycl::handler &cgh) { +void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_zhbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_chemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> 
&x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_zhemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_x.GET_MULTI_PTR, (const int)incx, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_x.GET_MULTI_PTR, (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void her(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { @@ -331,10 +331,10 @@ void her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, }); } -void her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void her(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { @@ -345,78 +345,78 @@ void her(sycl::queue &queue, uplo 
upper_lower, int64_t n, double alpha, }); } -void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { ::cblas_cher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); }); }); } -void her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { ::cblas_zher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy, accessor_a.GET_MULTI_PTR, (const int)lda); }); }); } -void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &ap, sycl::buffer, 1> &x, - int64_t incx, std::complex 
beta, sycl::buffer, 1> &y, +void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, + int64_t incx, std::complex beta, sycl::buffer, 1>& y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_chpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, - (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR, + (const void*)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &ap, sycl::buffer, 1> &x, - int64_t incx, std::complex beta, sycl::buffer, 1> &y, +void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, + int64_t incx, std::complex beta, sycl::buffer, 1>& y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); host_task(cgh, [=]() { ::cblas_zhpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, - (const int)incx, (const void *)&beta, accessor_y.GET_MULTI_PTR, + (const void*)&alpha, accessor_ap.GET_MULTI_PTR, accessor_x.GET_MULTI_PTR, + (const int)incx, (const void*)&beta, accessor_y.GET_MULTI_PTR, (const int)incy); }); }); } -void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &ap) { - queue.submit([&](sycl::handler &cgh) { +void 
hpr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_ap = ap.get_access(cgh); host_task(cgh, [=]() { @@ -427,10 +427,10 @@ void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, }); } -void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &ap) { - queue.submit([&](sycl::handler &cgh) { +void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_ap = ap.get_access(cgh); host_task(cgh, [=]() { @@ -441,42 +441,42 @@ void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, }); } -void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &ap) { - queue.submit([&](sycl::handler &cgh) { +void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_ap = ap.get_access(cgh); host_task(cgh, [=]() { ::cblas_chpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR); }); }); } -void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &ap) { - queue.submit([&](sycl::handler 
&cgh) { +void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_ap = ap.get_access(cgh); host_task(cgh, [=]() { ::cblas_zhpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, + (const void*)&alpha, accessor_x.GET_MULTI_PTR, (const int)incx, accessor_y.GET_MULTI_PTR, (const int)incy, accessor_ap.GET_MULTI_PTR); }); }); } -void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -489,10 +489,10 @@ void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alph }); } -void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -505,10 +505,10 @@ void sbmv(sycl::queue &queue, uplo upper_lower, 
int64_t n, int64_t k, double alp }); } -void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &ap, - sycl::buffer &x, int64_t incx, float beta, sycl::buffer &y, +void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& ap, + sycl::buffer& x, int64_t incx, float beta, sycl::buffer& y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -521,10 +521,10 @@ void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::bu }); } -void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - sycl::buffer &ap, sycl::buffer &x, int64_t incx, double beta, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + sycl::buffer& ap, sycl::buffer& x, int64_t incx, double beta, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -537,9 +537,9 @@ void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, }); } -void spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &ap) { - queue.submit([&](sycl::handler &cgh) { +void spr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_ap = ap.get_access(cgh); host_task(cgh, [=]() { @@ -550,9 +550,9 @@ void spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buf }); } -void spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &ap) { - 
queue.submit([&](sycl::handler &cgh) { +void spr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_ap = ap.get_access(cgh); host_task(cgh, [=]() { @@ -563,9 +563,9 @@ void spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::bu }); } -void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &ap) { - queue.submit([&](sycl::handler &cgh) { +void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_ap = ap.get_access(cgh); @@ -577,9 +577,9 @@ void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::bu }); } -void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &ap) { - queue.submit([&](sycl::handler &cgh) { +void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& ap) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_ap = ap.get_access(cgh); @@ -591,10 +591,10 @@ void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::b }); } -void symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx, float beta, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void symv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& x, 
int64_t incx, float beta, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -607,10 +607,10 @@ void symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::bu }); } -void symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &a, - int64_t lda, sycl::buffer &x, int64_t incx, double beta, - sycl::buffer &y, int64_t incy) { - queue.submit([&](sycl::handler &cgh) { +void symv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& a, + int64_t lda, sycl::buffer& x, int64_t incx, double beta, + sycl::buffer& y, int64_t incy) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); @@ -623,9 +623,9 @@ void symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::b }); } -void syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void syr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, [=]() { @@ -636,9 +636,9 @@ void syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buf }); } -void syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { +void syr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& a, int64_t lda) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_a = a.get_access(cgh); host_task(cgh, 
[=]() { @@ -649,10 +649,10 @@ void syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::bu }); } -void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); @@ -665,10 +665,10 @@ void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, sycl::bu }); } -void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_x = x.get_access(cgh); auto accessor_y = y.get_access(cgh); auto accessor_a = a.get_access(cgh); @@ -681,10 +681,10 @@ void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, sycl::b }); } -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -696,10 +696,10 @@ void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void 
tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -711,10 +711,10 @@ void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -726,10 +726,10 @@ void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -741,10 +741,10 @@ void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t 
k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -756,10 +756,10 @@ void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -771,10 +771,10 @@ void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -786,10 +786,10 @@ void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void 
tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -801,9 +801,9 @@ void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &ap, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -814,9 +814,9 @@ void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &ap, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -827,10 +827,10 @@ void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &ap, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto 
accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -841,10 +841,10 @@ void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &ap, sycl::buffer, 1> &x, +void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -855,9 +855,9 @@ void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &ap, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -868,9 +868,9 @@ void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &ap, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& ap, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -881,10 +881,10 @@ void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, 
- sycl::buffer, 1> &ap, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -895,10 +895,10 @@ void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &ap, sycl::buffer, 1> &x, +void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& ap, sycl::buffer, 1>& x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_ap = ap.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -909,9 +909,9 @@ void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -922,9 +922,9 @@ void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag }); } -void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, 
sycl::buffer& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -935,10 +935,10 @@ void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag }); } -void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -949,10 +949,10 @@ void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag }); } -void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -963,9 +963,9 @@ void trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag }); } -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = 
x.get_access(cgh); host_task(cgh, [=]() { @@ -976,9 +976,9 @@ void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -989,10 +989,10 @@ void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -1003,10 +1003,10 @@ void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, }); } -void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { - queue.submit([&](sycl::handler &cgh) { +void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_x = x.get_access(cgh); host_task(cgh, [=]() { @@ -1019,10 +1019,10 @@ void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, 
// USM APIs -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - float alpha, const float *a, int64_t lda, const float *x, int64_t incx, float beta, - float *y, int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + float alpha, const float* a, int64_t lda, const float* x, int64_t incx, float beta, + float* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1036,11 +1036,11 @@ sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int6 return done; } -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - double alpha, const double *a, int64_t lda, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + double alpha, const double* a, int64_t lda, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1054,48 +1054,48 @@ sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int6 return done; } -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto 
done = queue.submit([&](sycl::handler &cgh) { +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_cgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const int)kl, (const int)ku, (const void *)&alpha, a, (const int)lda, x, - (const int)incx, (const void *)&beta, y, (const int)incy); + (const int)kl, (const int)ku, (const void*)&alpha, a, (const int)lda, x, + (const int)incx, (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zgbmv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const int)kl, (const int)ku, (const void *)&alpha, a, (const int)lda, x, - (const int)incx, (const void *)&beta, y, (const int)incy); + (const int)kl, (const int)ku, (const void*)&alpha, a, 
(const int)lda, x, + (const int)incx, (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y, - int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + const float* a, int64_t lda, const float* x, int64_t incx, float beta, float* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1109,10 +1109,10 @@ sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, floa return done; } -sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - const double *a, int64_t lda, const double *x, int64_t incx, double beta, - double *y, int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + const double* a, int64_t lda, const double* x, int64_t incx, double beta, + double* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1126,48 +1126,48 @@ sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, doub return done; } -sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = 
queue.submit([&](sycl::handler &cgh) { +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_cgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, x, (const int)incx, - (const void *)&beta, y, (const int)incy); + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zgemv(MAJOR, convert_to_cblas_trans(trans), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, x, (const int)incx, - (const void *)&beta, y, (const int)incy); + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, const float *x, int64_t 
incx, - const float *y, int64_t incy, float *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event ger(sycl::queue& queue, int64_t m, int64_t n, float alpha, const float* x, int64_t incx, + const float* y, int64_t incy, float* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1180,10 +1180,10 @@ sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, float alpha, const flo return done; } -sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, const double *x, - int64_t incx, const double *y, int64_t incy, double *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event ger(sycl::queue& queue, int64_t m, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1196,152 +1196,152 @@ sycl::event ger(sycl::queue &queue, int64_t m, int64_t n, double alpha, const do return done; } -sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < 
num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha, x, + ::cblas_cgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, (const int)lda); }); }); return done; } -sycl::event gerc(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gerc(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void *)&alpha, x, + ::cblas_zgerc(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, (const int)lda); }); }); return done; } -sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void 
*)&alpha, x, + ::cblas_cgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, (const int)lda); }); }); return done; } -sycl::event geru(sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event geru(sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { - ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void *)&alpha, x, + ::cblas_zgeru(MAJOR, (const int)m, (const int)n, (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, (const int)lda); }); }); return done; } -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_chbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, - (const void *)&alpha, a, (const 
int)lda, x, (const int)incx, - (const void *)&beta, y, (const int)incy); + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zhbmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, (const int)k, - (const void *)&alpha, a, (const int)lda, x, (const int)incx, - (const void *)&beta, y, (const int)incy); + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *x, - int64_t incx, std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, const std::complex* x, + int64_t incx, std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_chemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, a, (const int)lda, x, (const int)incx, - (const void *)&beta, y, (const int)incy); + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, const std::complex *x, - int64_t incx, std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* a, int64_t lda, const std::complex* x, + int64_t incx, std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zhemv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, a, (const int)lda, x, (const int)incx, - (const void *)&beta, y, (const int)incy); + (const void*)&alpha, a, (const int)lda, x, (const int)incx, + (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - const std::complex *x, int64_t incx, std::complex *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + const std::complex* x, int64_t incx, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); @@ -1354,10 +1354,10 @@ sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, return done; } -sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - const std::complex *x, int64_t incx, std::complex *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + const std::complex* x, int64_t incx, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1370,82 +1370,82 @@ sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, return done; } -sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_cher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, x, (const int)incx, y, (const int)incy, a, + (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, (const int)lda); }); }); return done; } -sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex 
*a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zher2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, x, (const int)incx, y, (const int)incy, a, + (const void*)&alpha, x, (const int)incx, y, (const int)incy, a, (const int)lda); }); }); return done; } -sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *ap, const std::complex *x, int64_t incx, - std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* ap, const std::complex* x, int64_t incx, + std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_chpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, ap, x, (const int)incx, (const void *)&beta, y, + (const void*)&alpha, ap, x, (const int)incx, (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *ap, const std::complex *x, int64_t incx, - std::complex beta, std::complex *y, int64_t incy, - const std::vector &dependencies) { 
- auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* ap, const std::complex* x, int64_t incx, + std::complex beta, std::complex* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zhpmv(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, ap, x, (const int)incx, (const void *)&beta, y, + (const void*)&alpha, ap, x, (const int)incx, (const void*)&beta, y, (const int)incy); }); }); return done; } -sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, - const std::complex *x, int64_t incx, std::complex *ap, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, + const std::complex* x, int64_t incx, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1458,10 +1458,10 @@ sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, return done; } -sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, - const std::complex *x, int64_t incx, std::complex *ap, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, + const std::complex* x, int64_t incx, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); @@ -1474,44 +1474,44 @@ sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, return done; } -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *ap, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_chpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, x, (const int)incx, y, (const int)incy, ap); + (const void*)&alpha, x, (const int)incx, y, (const int)incy, ap); }); }); return done; } -sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *ap, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zhpr2(MAJOR, convert_to_cblas_uplo(upper_lower), (const int)n, - (const void *)&alpha, x, (const int)incx, y, (const int)incy, ap); + (const void*)&alpha, x, (const int)incx, y, (const int)incy, ap); }); }); return done; } 
-sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, float alpha, - const float *a, int64_t lda, const float *x, int64_t incx, float beta, float *y, - int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, float alpha, + const float* a, int64_t lda, const float* x, int64_t incx, float beta, float* y, + int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1525,10 +1525,10 @@ sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, flo return done; } -sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, double alpha, - const double *a, int64_t lda, const double *x, int64_t incx, double beta, - double *y, int64_t incy, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, double alpha, + const double* a, int64_t lda, const double* x, int64_t incx, double beta, + double* y, int64_t incy, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1542,10 +1542,10 @@ sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, dou return done; } -sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *ap, - const float *x, int64_t incx, float beta, float *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* ap, + const float* x, int64_t incx, float beta, 
float* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1559,10 +1559,10 @@ sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, c return done; } -sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *ap, - const double *x, int64_t incx, double beta, double *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* ap, + const double* x, int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1576,9 +1576,9 @@ sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, return done; } -sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x, - int64_t incx, float *ap, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, float* ap, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1591,9 +1591,9 @@ sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, co return done; } -sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, double *ap, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event 
spr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, double* ap, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1606,10 +1606,10 @@ sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, c return done; } -sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x, - int64_t incx, const float *y, int64_t incy, float *ap, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, const float* y, int64_t incy, float* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1622,10 +1622,10 @@ sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, c return done; } -sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, const double *y, int64_t incy, double *ap, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, double* ap, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1638,10 +1638,10 @@ sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, return done; } -sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *a, - int64_t 
lda, const float *x, int64_t incx, float beta, float *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* a, + int64_t lda, const float* x, int64_t incx, float beta, float* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1655,10 +1655,10 @@ sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, c return done; } -sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *a, - int64_t lda, const double *x, int64_t incx, double beta, double *y, int64_t incy, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* a, + int64_t lda, const double* x, int64_t incx, double beta, double* y, int64_t incy, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1672,9 +1672,9 @@ sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, return done; } -sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x, - int64_t incx, float *a, int64_t lda, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, float* a, int64_t lda, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); @@ -1687,10 +1687,10 @@ sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, co return done; } -sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, double *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, double* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1703,10 +1703,10 @@ sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, c return done; } -sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, const float *x, - int64_t incx, const float *y, int64_t incy, float *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, float alpha, const float* x, + int64_t incx, const float* y, int64_t incy, float* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1720,10 +1720,10 @@ sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, float alpha, c return done; } -sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, const double *x, - int64_t incx, const double *y, int64_t incy, double *a, int64_t lda, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, double alpha, const double* x, + int64_t incx, const double* y, int64_t incy, 
double* a, int64_t lda, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1737,10 +1737,10 @@ sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, double alpha, return done; } -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1754,10 +1754,10 @@ sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1771,10 +1771,10 @@ sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t 
lda, std::complex *x, - int64_t incx, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1788,10 +1788,10 @@ sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t lda, std::complex *x, - int64_t incx, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1805,10 +1805,10 @@ sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < 
num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1822,10 +1822,10 @@ sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1839,10 +1839,10 @@ sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t lda, std::complex *x, - int64_t incx, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1856,10 +1856,10 @@ sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - int64_t k, const std::complex *a, int64_t lda, std::complex *x, - int64_t incx, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event 
tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + int64_t k, const std::complex* a, int64_t lda, std::complex* x, + int64_t incx, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1873,10 +1873,10 @@ sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const float *ap, float *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* ap, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1889,10 +1889,10 @@ sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const double *ap, double *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* ap, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1905,10 +1905,10 @@ sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, 
int64_t n, - const std::complex *ap, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1921,10 +1921,10 @@ sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *ap, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1937,10 +1937,10 @@ sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const float *ap, float *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* ap, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1953,10 +1953,10 @@ sycl::event 
tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const double *ap, double *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* ap, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1969,10 +1969,10 @@ sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *ap, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1985,10 +1985,10 @@ sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *ap, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* ap, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = 
queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2001,10 +2001,10 @@ sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - const float *a, int64_t lda, float *b, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const float* a, int64_t lda, float* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2018,10 +2018,10 @@ sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag un return done; } -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - const double *a, int64_t lda, double *b, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const double* a, int64_t lda, double* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2035,10 +2035,10 @@ sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag un return done; } -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *b, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) 
{ +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2052,10 +2052,10 @@ sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag un return done; } -sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *b, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose transa, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* b, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2069,10 +2069,10 @@ sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose transa, diag un return done; } -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const float *a, int64_t lda, float *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const float* a, int64_t lda, float* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2086,10 +2086,10 @@ sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event 
trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const double *a, int64_t lda, double *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const double* a, int64_t lda, double* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2103,10 +2103,10 @@ sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -2120,10 +2120,10 @@ sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag uni return done; } -sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, - const std::complex *a, int64_t lda, std::complex *x, int64_t incx, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, + const std::complex* a, int64_t lda, std::complex* x, int64_t incx, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { 
int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); diff --git a/src/blas/backends/netlib/netlib_level3.cxx b/src/blas/backends/netlib/netlib_level3.cxx index 8bb6a04ae..2579e66e1 100644 --- a/src/blas/backends/netlib/netlib_level3.cxx +++ b/src/blas/backends/netlib/netlib_level3.cxx @@ -19,10 +19,10 @@ // Buffer APIs -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -36,10 +36,10 @@ void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int }); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, double beta, sycl::buffer &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -53,46 +53,46 @@ void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int }); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - std::complex alpha, 
sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_cgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), - (const int)m, (const int)n, (const int)k, (const void *)&alpha, + (const int)m, (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, - (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_zgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), - (const int)m, (const int)n, (const int)k, (const void *)&alpha, + (const int)m, (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const 
int)lda, accessor_b.GET_MULTI_PTR, - (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - sycl::half alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, sycl::half beta, - sycl::buffer &c, int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + sycl::half alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, sycl::half beta, + sycl::buffer& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm", "for column_major layout"); #endif @@ -101,9 +101,9 @@ void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int #endif } -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm", "for column_major layout"); #endif @@ -112,9 +112,9 @@ void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int #endif } -void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - float alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, float beta, sycl::buffer &c, int64_t ldc) { +void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, + float alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm", 
"for column_major layout"); #endif @@ -123,46 +123,46 @@ void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int #endif } -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_chemm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_zhemm(MAJOR, 
convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, - sycl::buffer, 1> &a, int64_t lda, float beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + sycl::buffer, 1>& a, int64_t lda, float beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { @@ -174,10 +174,10 @@ void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int6 }); } -void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, - sycl::buffer, 1> &a, int64_t lda, double beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, + sycl::buffer, 1>& a, int64_t lda, double beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { @@ -189,17 +189,17 @@ void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int6 }); } -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, float beta, - sycl::buffer, 1> &c, int64_t ldc) { - 
queue.submit([&](sycl::handler &cgh) { +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, float beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_cher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb, (const float)beta, accessor_c.GET_MULTI_PTR, (const int)ldc); @@ -207,17 +207,17 @@ void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int }); } -void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, double beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, double beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_zher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb, (const double)beta, accessor_c.GET_MULTI_PTR, (const int)ldc); @@ -225,10 +225,10 @@ void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, 
int64_t n, int }); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - float beta, sycl::buffer &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + float beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -242,10 +242,10 @@ void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int6 }); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, double alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - double beta, sycl::buffer &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, double alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + double beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -259,46 +259,46 @@ void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int6 }); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + 
queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_csymm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_zsymm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, - accessor_b.GET_MULTI_PTR, (const int)ldb, (const void *)&beta, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + accessor_b.GET_MULTI_PTR, (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, float beta, sycl::buffer &c, +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + 
sycl::buffer& a, int64_t lda, float beta, sycl::buffer& c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { @@ -310,10 +310,10 @@ void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int6 }); } -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, - sycl::buffer &a, int64_t lda, double beta, sycl::buffer &c, +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, double alpha, + sycl::buffer& a, int64_t lda, double beta, sycl::buffer& c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { @@ -325,40 +325,40 @@ void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int6 }); } -void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_csyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, - accessor_a.GET_MULTI_PTR, (const int)lda, (const void *)&beta, - accessor_c.GET_MULTI_PTR, (const int)ldc); + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); }); }); } -void 
syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_zsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, - accessor_a.GET_MULTI_PTR, (const int)lda, (const void *)&beta, - accessor_c.GET_MULTI_PTR, (const int)ldc); + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, + (const int)lda, (const void*)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldc); }); }); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb, - float beta, sycl::buffer &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, float alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb, + float beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -371,10 +371,10 @@ void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int }); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, double beta, sycl::buffer &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void syr2k(sycl::queue& 
queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); @@ -388,46 +388,46 @@ void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int }); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_csyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, - (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, int64_t ldc) { - queue.submit([&](sycl::handler &cgh) { +void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& 
b, int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, int64_t ldc) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); auto accessor_c = c.get_access(cgh); host_task(cgh, [=]() { ::cblas_zsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, + (const int)n, (const int)k, (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, - (const int)ldb, (const void *)&beta, accessor_c.GET_MULTI_PTR, + (const int)ldb, (const void*)&beta, accessor_c.GET_MULTI_PTR, (const int)ldc); }); }); } -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - int64_t m, int64_t n, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -440,10 +440,10 @@ void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans }); } -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - int64_t m, int64_t n, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -456,43 +456,43 @@ void trmm(sycl::queue &queue, side 
left_right, uplo upper_lower, transpose trans }); } -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { ::cblas_ctrmm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb); }); }); } -void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, +void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { ::cblas_ztrmm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb); }); }); } -void trsm(sycl::queue 
&queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - int64_t m, int64_t n, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -505,10 +505,10 @@ void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans }); } -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - int64_t m, int64_t n, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { @@ -521,34 +521,34 @@ void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans }); } -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, - int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, + int64_t m, int64_t n, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { ::cblas_ctrsm(MAJOR, 
convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb); }); }); } -void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, +void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &b, int64_t ldb) { - queue.submit([&](sycl::handler &cgh) { + sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& b, int64_t ldb) { + queue.submit([&](sycl::handler& cgh) { auto accessor_a = a.get_access(cgh); auto accessor_b = b.get_access(cgh); host_task(cgh, [=]() { ::cblas_ztrsm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, + (const void*)&alpha, accessor_a.GET_MULTI_PTR, (const int)lda, accessor_b.GET_MULTI_PTR, (const int)ldb); }); }); @@ -556,10 +556,10 @@ void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans // USM APIs -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, - float beta, float *c, int64_t ldc, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, + float beta, float* c, int64_t ldc, const std::vector& dependencies) { + auto done = 
queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -573,11 +573,11 @@ sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t return done; } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -591,50 +591,48 @@ sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t return done; } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_cgemm(MAJOR, 
convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), - (const int)m, (const int)n, (const int)k, (const void *)&alpha, a, - (const int)lda, b, (const int)ldb, (const void *)&beta, c, - (const int)ldc); + (const int)m, (const int)n, (const int)k, (const void*)&alpha, a, + (const int)lda, b, (const int)ldb, (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zgemm(MAJOR, convert_to_cblas_trans(transa), convert_to_cblas_trans(transb), - (const int)m, (const int)n, (const int)k, (const void *)&alpha, a, - (const int)lda, b, (const int)ldb, (const void *)&beta, c, - (const int)ldc); + (const int)m, (const int)n, (const int)k, (const void*)&alpha, a, + (const int)lda, b, (const int)ldb, (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, sycl::half alpha, const sycl::half *a, int64_t lda, const sycl::half *b, - int64_t ldb, sycl::half beta, sycl::half *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t 
k, sycl::half alpha, const sycl::half* a, int64_t lda, const sycl::half* b, + int64_t ldb, sycl::half beta, sycl::half* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm", "for column_major layout"); #endif @@ -643,10 +641,10 @@ sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const sycl::half *a, int64_t lda, const sycl::half *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const sycl::half* a, int64_t lda, const sycl::half* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm", "for column_major layout"); #endif @@ -655,10 +653,10 @@ sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const bfloat16 *a, int64_t lda, const bfloat16 *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, float alpha, const bfloat16* a, int64_t lda, const bfloat16* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { #ifdef COLUMN_MAJOR throw unimplemented("blas", "gemm", "for column_major layout"); #endif @@ -667,12 +665,12 @@ sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t #endif } -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const 
std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -680,19 +678,19 @@ sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t host_task(cgh, [=]() { ::cblas_chemm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb, - (const void *)&beta, c, (const int)ldc); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -700,18 +698,18 @@ sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t host_task(cgh, 
[=]() { ::cblas_zhemm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb, - (const void *)&beta, c, (const int)ldc); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const std::complex *a, int64_t lda, float beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const std::complex* a, int64_t lda, float beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -725,11 +723,11 @@ sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t return done; } -sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const std::complex *a, int64_t lda, double beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const std::complex* a, int64_t lda, double beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -743,46 +741,46 @@ sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t return done; } -sycl::event her2k(sycl::queue &queue, uplo 
upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, float beta, std::complex *c, - int64_t ldc, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, float beta, std::complex* c, + int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_cher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, b, + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, (const int)ldb, (const float)beta, c, (const int)ldc); }); }); return done; } -sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, double beta, std::complex *c, - int64_t ldc, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, double beta, std::complex* c, + int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zher2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, a, 
(const int)lda, b, + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, (const int)ldb, (const double)beta, c, (const int)ldc); }); }); return done; } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta, - float *c, int64_t ldc, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, float beta, + float* c, int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -797,11 +795,11 @@ sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t return done; } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - double alpha, const double *a, int64_t lda, const double *b, int64_t ldb, - double beta, double *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + double alpha, const double* a, int64_t lda, const double* b, int64_t ldb, + double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -816,12 +814,12 @@ sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t return done; } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const 
std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -829,19 +827,19 @@ sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t host_task(cgh, [=]() { ::cblas_csymm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb, - (const void *)&beta, c, (const int)ldc); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -849,17 +847,17 @@ sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t host_task(cgh, 
[=]() { ::cblas_zsymm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb, - (const void *)&beta, c, (const int)ldc); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb, + (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -873,10 +871,10 @@ sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t return done; } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -890,46 +888,46 @@ sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t return done; } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - 
std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_csyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, - (const void *)&beta, c, (const int)ldc); + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, + (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zsyrk(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, - (const void *)&beta, c, (const int)ldc); + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, + (const void*)&beta, c, 
(const int)ldc); }); }); return done; } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, float beta, - float *c, int64_t ldc, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, float beta, + float* c, int64_t ldc, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -943,11 +941,11 @@ sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t return done; } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, const double *b, int64_t ldb, - double beta, double *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + double alpha, const double* a, int64_t lda, const double* b, int64_t ldb, + double beta, double* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -961,48 +959,48 @@ sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t return done; } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = 
queue.submit([&](sycl::handler &cgh) { +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_csyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, b, - (const int)ldb, (const void *)&beta, c, (const int)ldc); + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, + (const int)ldb, (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } host_task(cgh, [=]() { ::cblas_zsyr2k(MAJOR, convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - (const int)n, (const int)k, (const void *)&alpha, a, (const int)lda, b, - (const int)ldb, (const void *)&beta, c, (const int)ldc); + (const int)n, (const int)k, (const void*)&alpha, a, (const int)lda, b, + 
(const int)ldb, (const void*)&beta, c, (const int)ldc); }); }); return done; } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda, - float *b, int64_t ldb, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, int64_t lda, + float* b, int64_t ldb, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1017,10 +1015,10 @@ sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpos return done; } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda, - double *b, int64_t ldb, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, int64_t lda, + double* b, int64_t ldb, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1035,11 +1033,11 @@ sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpos return done; } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, 
int64_t ldb, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1048,17 +1046,17 @@ sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpos ::cblas_ctrmm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); }); }); return done; } -sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1067,16 +1065,16 @@ sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpos ::cblas_ztrmm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); }); }); return done; } -sycl::event trsm(sycl::queue &queue, side left_right, uplo 
upper_lower, transpose transa, - diag unit_diag, int64_t m, int64_t n, float alpha, const float *a, int64_t lda, - float *b, int64_t ldb, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, float alpha, const float* a, int64_t lda, + float* b, int64_t ldb, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1091,10 +1089,10 @@ sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpos return done; } -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, - diag unit_diag, int64_t m, int64_t n, double alpha, const double *a, int64_t lda, - double *b, int64_t ldb, const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, + diag unit_diag, int64_t m, int64_t n, double alpha, const double* a, int64_t lda, + double* b, int64_t ldb, const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1109,11 +1107,11 @@ sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpos return done; } -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { + const std::complex* a, int64_t 
lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1122,17 +1120,17 @@ sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpos ::cblas_ctrsm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); }); }); return done; } -sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose transa, +sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose transa, diag unit_diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - const std::vector &dependencies) { - auto done = queue.submit([&](sycl::handler &cgh) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + const std::vector& dependencies) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); @@ -1141,7 +1139,7 @@ sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpos ::cblas_ztrsm(MAJOR, convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(transa), convert_to_cblas_diag(unit_diag), (const int)m, (const int)n, - (const void *)&alpha, a, (const int)lda, b, (const int)ldb); + (const void*)&alpha, a, (const int)lda, b, (const int)ldb); }); }); return done; diff --git a/src/blas/backends/portblas/portblas_batch.cxx b/src/blas/backends/portblas/portblas_batch.cxx index 28c7ee5dc..2fe63a127 100644 --- a/src/blas/backends/portblas/portblas_batch.cxx +++ 
b/src/blas/backends/portblas/portblas_batch.cxx @@ -19,999 +19,999 @@ // Buffer APIs -void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, +void syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "syrk_batch", ""); } -void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &c, +void syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "syrk_batch", ""); } -void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +void syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "syrk_batch", ""); } -void syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +void syrk_batch(sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "syrk_batch", ""); } -void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, float beta, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, float beta, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "gemv_batch", ""); } -void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, double beta, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, double beta, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "gemv_batch", ""); } -void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, 
sycl::buffer, 1> &x, +void gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "gemv_batch", ""); } -void gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &x, +void gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "gemv_batch", ""); } -void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", ""); } -void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, 
std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", ""); } -void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", ""); } -void dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { throw unimplemented("blas", "dgmm_batch", ""); } -void axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer &y, +void axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, 
std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer &y, +void axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "axpy_batch", ""); } -void axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "axpy_batch", ""); } -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, std::int64_t incy, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw 
unimplemented("blas", "copy_batch", ""); } -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, std::int64_t incy, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "copy_batch", ""); } -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "copy_batch", ""); } -void copy_batch(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void copy_batch(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { throw unimplemented("blas", "copy_batch", ""); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a, lda, stride_a, 
b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "gemm_batch", " for complex"); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, 
std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "gemm_batch", " for complex"); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "gemm_batch", " for complex"); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "gemm_batch", " for unsupported dtype"); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, 
oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "gemm_batch", " for unsupported dtype"); } -void gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "gemm_batch", " for unsupported dtype"); } -void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("blas", "trsm_batch", ""); } -void trsm_batch(sycl::queue &queue, oneapi::mkl::side 
left_right, oneapi::mkl::uplo upper_lower, +void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("blas", "trsm_batch", ""); } -void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1> &b, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("blas", "trsm_batch", ""); } -void trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("blas", "trsm_batch", ""); } -void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - 
std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("blas", "omatcopy_batch", ""); } -void omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +void omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { 
throw unimplemented("blas", "omatcopy_batch", ""); } -void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", ""); } -void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", ""); } -void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", ""); } -void imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +void imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", ""); } -void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, 
sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, +void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer &c, +void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { CALL_PORTBLAS_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, 
sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", ""); } -void omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { throw unimplemented("blas", "omatadd_batch", ""); } // USM APIs -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* 
n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, 
std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event syrk_batch(sycl::queue &queue, oneapi::mkl::uplo upper_lower, +sycl::event syrk_batch(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, 
std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "syrk_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float beta, float *y, std::int64_t incy, +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double beta, double *y, std::int64_t incy, +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, 
oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, std::complex beta, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, std::complex beta, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float *beta, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, + std::int64_t* n, float* 
alpha, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float* beta, float** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double *beta, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double* beta, double** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event gemv_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, const 
std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemv_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, const float *a, std::int64_t lda, std::int64_t stridea, - const float *x, std::int64_t incx, std::int64_t stridex, float *c, +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, const float* a, std::int64_t lda, std::int64_t stridea, + const float* x, std::int64_t incx, std::int64_t stridex, float* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, const double *a, std::int64_t lda, std::int64_t stridea, - const double *x, std::int64_t incx, std::int64_t stridex, double *c, +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, const double* a, std::int64_t lda, std::int64_t stridea, + const double* x, std::int64_t incx, std::int64_t stridex, double* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, 
oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *c, std::int64_t ldc, +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, - std::int64_t n, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *c, std::int64_t ldc, +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, + std::int64_t n, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, - std::int64_t *n, const float **a, std::int64_t *lda, const float **x, - std::int64_t *incx, float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, + std::int64_t* n, const float** a, std::int64_t* lda, const float** x, + std::int64_t* incx, float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const 
std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, - std::int64_t *n, const double **a, std::int64_t *lda, const double **x, - std::int64_t *incx, double **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, + std::int64_t* n, const double** a, std::int64_t* lda, const double** x, + std::int64_t* incx, double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, - std::int64_t *n, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, + std::int64_t* n, const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event dgmm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, - std::int64_t *n, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, + std::int64_t* n, const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, 
std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "dgmm_batch", " for USM"); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, - std::int64_t *incx, float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, float* alpha, const float** x, + std::int64_t* incx, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", " for USM"); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, - std::int64_t *incx, double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, double* alpha, const double** x, + std::int64_t* incx, double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", " for USM"); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", " for USM"); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t 
group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", " for USM"); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, float alpha, const float *x, - std::int64_t incx, std::int64_t stridex, float *y, std::int64_t incy, +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, float alpha, const float* x, + std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, double alpha, const double *x, - std::int64_t incx, std::int64_t stridex, double *y, std::int64_t incy, +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, double alpha, const double* x, + std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_axpy_batch, queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t 
stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", " for USM"); } -sycl::event axpy_batch(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event axpy_batch(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "axpy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const float **x, std::int64_t *incx, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const float** x, std::int64_t* incx, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const double **x, std::int64_t *incx, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const double** x, std::int64_t* incx, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, 
std::int64_t* n, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t *n, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, std::int64_t* n, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, +sycl::event 
copy_batch(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event copy_batch(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, +sycl::event copy_batch(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "copy_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, - const float **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const float** a, std::int64_t* lda, + const float** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, - const double **b, std::int64_t *ldb, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const 
std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, double* alpha, const double** a, std::int64_t* lda, + const double** b, std::int64_t* ldb, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex* alpha, const 
std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, sycl::half *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " 
for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose *transa, - oneapi::mkl::transpose *transb, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose* transa, + oneapi::mkl::transpose* transb, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const float *a, 
std::int64_t lda, - std::int64_t stride_a, const float *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t k, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, const float* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, const double *b, std::int64_t ldb, - std::int64_t stride_b, double beta, double *c, std::int64_t ldc, + std::int64_t k, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, const double* b, std::int64_t ldb, + std::int64_t stride_b, double beta, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_gemm_strided_batched, queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, const std::complex* 
b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, const std::complex *b, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, const std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::complex beta, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, + std::int64_t k, sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " 
for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t k, float alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event gemm_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb, - std::int64_t stride_b, float beta, std::int32_t *c, 
std::int64_t ldc, + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, + std::int64_t stride_b, float beta, std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "gemm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float *b, + const float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double *b, + const double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t 
stride_a, std::complex *b, std::int64_t ldb, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side left_right, +sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n, float *alpha, - const float **a, std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, float* alpha, + const float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - 
oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event trsm_batch(sycl::queue &queue, oneapi::mkl::side *left_right, - oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - oneapi::mkl::diag *unit_diag, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(sycl::queue& queue, 
oneapi::mkl::side* left_right, + oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + oneapi::mkl::diag* unit_diag, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "trsm_batch", " for USM"); } -sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, +sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_omatcopy_batch, queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - 
std::int64_t lda, std::int64_t stride_a, std::complex *b, +sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", " for USM"); } -sycl::event omatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, +sycl::event omatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", " for USM"); } -sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, float *ab, std::int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", " for USM"); } -sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, double *ab, std::int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, 
std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", " for USM"); } -sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, +sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", " for USM"); } -sycl::event imatcopy_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, std::complex *ab, +sycl::event imatcopy_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", " for USM"); } -sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - float beta, const float *b, std::int64_t ldb, std::int64_t stride_b, - float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + float beta, const float* b, std::int64_t ldb, std::int64_t stride_b, + float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { 
CALL_PORTBLAS_USM_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - double beta, const double *b, std::int64_t ldb, std::int64_t stride_b, - double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + double beta, const double* b, std::int64_t ldb, std::int64_t stride_b, + double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_omatadd_batch, queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", " for USM"); } -sycl::event omatadd_batch(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event 
omatadd_batch(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { throw unimplemented("blas", "omatadd_batch", " for USM"); } diff --git a/src/blas/backends/portblas/portblas_gemm_bias.cxx b/src/blas/backends/portblas/portblas_gemm_bias.cxx index 30f638f3e..0b62ee674 100644 --- a/src/blas/backends/portblas/portblas_gemm_bias.cxx +++ b/src/blas/backends/portblas/portblas_gemm_bias.cxx @@ -19,72 +19,72 @@ // Buffer APIs -void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + float alpha, sycl::buffer& a, std::int64_t lda, int8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", ""); } -void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, 
std::int64_t lda, int8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + float alpha, sycl::buffer& a, std::int64_t lda, int8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", ""); } -void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + float alpha, sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", ""); } -void gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + float alpha, sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", ""); } // USM APIs -sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, - std::int64_t n, std::int64_t 
k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, - std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", " for USM"); } -sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int8_t ao, const std::int8_t *b, std::int64_t ldb, - std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int8_t ao, const std::int8_t* b, std::int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", " for USM"); } -sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const std::uint8_t *a, - std::int64_t lda, std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, - std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::uint8_t* a, + std::int64_t lda, std::uint8_t ao, 
const std::int8_t* b, std::int64_t ldb, + std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", " for USM"); } -sycl::event gemm_bias(sycl::queue &queue, oneapi::mkl::transpose transa, +sycl::event gemm_bias(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, - std::int64_t n, std::int64_t k, float alpha, const std::uint8_t *a, - std::int64_t lda, std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, - std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies) { + std::int64_t n, std::int64_t k, float alpha, const std::uint8_t* a, + std::int64_t lda, std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, + std::uint8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", " for USM"); } diff --git a/src/blas/backends/portblas/portblas_level1.cxx b/src/blas/backends/portblas/portblas_level1.cxx index 0a0af855c..6d1f39463 100644 --- a/src/blas/backends/portblas/portblas_level1.cxx +++ b/src/blas/backends/portblas/portblas_level1.cxx @@ -19,91 +19,91 @@ // Buffer APIs -void dotc(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotc(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { throw unimplemented("blas", "dotc", ""); } -void dotu(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotu(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, + 
sycl::buffer, 1>& result) { throw unimplemented("blas", "dotu", ""); } -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { CALL_PORTBLAS_FN(::blas::_iamax, queue, n, x, incx, result); } -void iamax(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void iamax(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "iamax", ""); } -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { CALL_PORTBLAS_FN(::blas::_iamin, queue, n, x, incx, result); } -void iamin(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void iamin(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "iamin", ""); } -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "asum", ""); } -void asum(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void asum(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { // portBLAS asum implementation requires that result is initialized to zero // before performing the computation. 
- queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto result_acc = result.template get_access(cgh); cgh.single_task([=]() { result_acc[0] = real_t(0); }); }); CALL_PORTBLAS_FN(::blas::_asum, queue, n, x, incx, result); } -void axpy(sycl::queue &queue, std::int64_t n, real_t alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void axpy(sycl::queue& queue, std::int64_t n, real_t alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_axpy, queue, n, alpha, x, incx, y, incy); } -void axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "axpy", "for complex"); } -void axpby(sycl::queue &queue, std::int64_t n, real_t alpha, sycl::buffer &x, - std::int64_t incx, real_t beta, sycl::buffer &y, std::int64_t incy) { +void axpby(sycl::queue& queue, std::int64_t n, real_t alpha, sycl::buffer& x, + std::int64_t incx, real_t beta, sycl::buffer& y, std::int64_t incy) { throw unimplemented("blas", "axpby", ""); } -void axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "axpby", ""); } -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_copy, queue, n, x, incx, y, 
incy); } -void copy(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy) { +void copy(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "copy", " for complex."); } -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result) { // portBLAS dot implementation requires that result is initialized to zero // before performing the computation. - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto result_acc = result.template get_access(cgh); cgh.single_task([=]() { result_acc[0] = real_t(0); }); }); @@ -111,288 +111,288 @@ void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::in } #ifdef ENABLE_MIXED_PRECISION_WITH_DOUBLE -void dot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& result) { throw unimplemented("blas", "dot", " for unmatched return type"); } #endif -void sdsdot(sycl::queue &queue, std::int64_t n, real_t sb, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void sdsdot(sycl::queue& queue, std::int64_t n, real_t sb, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { // portBLAS sdsdot implementation requires that result is initialized to zero // before performing the computation. 
- queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto result_acc = result.template get_access(cgh); cgh.single_task([=]() { result_acc[0] = real_t(0); }); }); CALL_PORTBLAS_FN(::blas::_sdsdot, queue, n, sb, x, incx, y, incy, result); } -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer& result) { throw unimplemented("blas", "nrm2", " for complex"); } -void nrm2(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &result) { +void nrm2(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& result) { // portBLAS nrm2 implementation requires that result is initialized to zero // before performing the computation. - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto result_acc = result.template get_access(cgh); cgh.single_task([=]() { result_acc[0] = real_t(0); }); }); CALL_PORTBLAS_FN(::blas::_nrm2, queue, n, x, incx, result); } -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy, real_t c, +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy, real_t c, real_t s) { throw unimplemented("blas", "rot", " for complex"); } -void rot(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, real_t c, real_t s) { +void rot(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, real_t c, real_t s) { CALL_PORTBLAS_FN(::blas::_rot, queue, n, x, incx, y, incy, c, s); } -void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s) { +void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + 
sycl::buffer& c, sycl::buffer& s) { CALL_PORTBLAS_FN(::blas::_rotg, queue, a, b, c, s); } -void rotg(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { throw unimplemented("blas", "rotg", " for complex"); } -void rotm(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, sycl::buffer ¶m) { +void rotm(sycl::queue& queue, std::int64_t n, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, sycl::buffer& param) { CALL_PORTBLAS_FN(::blas::_rotm, queue, n, x, incx, y, incy, param); } -void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, real_t y1, sycl::buffer ¶m) { +void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, real_t y1, sycl::buffer& param) { CALL_PORTBLAS_FN(::blas::_rotmg, queue, d1, d2, x1, y1, param); } -void scal(sycl::queue &queue, std::int64_t n, real_t alpha, sycl::buffer &x, +void scal(sycl::queue& queue, std::int64_t n, real_t alpha, sycl::buffer& x, std::int64_t incx) { CALL_PORTBLAS_FN(::blas::_scal, queue, n, alpha, x, incx); } -void scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { throw unimplemented("blas", "scal", " for complex"); } -void scal(sycl::queue &queue, std::int64_t n, real_t alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(sycl::queue& queue, std::int64_t n, real_t alpha, + sycl::buffer, 1>& x, std::int64_t incx) { throw unimplemented("blas", "scal", " for complex"); } -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy) { +void swap(sycl::queue& queue, std::int64_t n, 
sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_swap, queue, n, x, incx, y, incy); } -void swap(sycl::queue &queue, std::int64_t n, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, std::int64_t incy) { +void swap(sycl::queue& queue, std::int64_t n, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "swap", " for complex"); } // USM APIs -sycl::event dotc(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies) { +sycl::event dotc(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies) { throw unimplemented("blas", "dotc", " for USM"); } -sycl::event dotu(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, const std::complex *y, std::int64_t incy, - std::complex *result, const std::vector &dependencies) { +sycl::event dotu(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, const std::complex* y, std::int64_t incy, + std::complex* result, const std::vector& dependencies) { throw unimplemented("blas", "dotu", " for USM"); } -sycl::event iamax(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies) { +sycl::event iamax(sycl::queue& queue, std::int64_t n, const real_t* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_iamax, queue, n, x, incx, result, dependencies); } -sycl::event iamax(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t 
incx, std::int64_t* result, + const std::vector& dependencies) { throw unimplemented("blas", "iamax", " for USM"); } -sycl::event iamin(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx, - std::int64_t *result, const std::vector &dependencies) { +sycl::event iamin(sycl::queue& queue, std::int64_t n, const real_t* x, std::int64_t incx, + std::int64_t* result, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_iamin, queue, n, x, incx, result, dependencies); } -sycl::event iamin(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { throw unimplemented("blas", "iamin", " for USM"); } -sycl::event asum(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, real_t *result, const std::vector &dependencies) { +sycl::event asum(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, real_t* result, const std::vector& dependencies) { throw unimplemented("blas", "asum", " for USM"); } -sycl::event asum(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx, - real_t *result, const std::vector &dependencies) { +sycl::event asum(sycl::queue& queue, std::int64_t n, const real_t* x, std::int64_t incx, + real_t* result, const std::vector& dependencies) { // portBLAS asum implementation requires result to be initializes to zero // before starting the computation. 
auto init_res_val = queue.submit( - [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); + [&](sycl::handler& cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); std::vector new_dependencies = dependencies; new_dependencies.push_back(init_res_val); CALL_PORTBLAS_USM_FN(::blas::_asum, queue, n, x, incx, result, new_dependencies); } -sycl::event axpy(sycl::queue &queue, std::int64_t n, real_t alpha, const real_t *x, - std::int64_t incx, real_t *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(sycl::queue& queue, std::int64_t n, real_t alpha, const real_t* x, + std::int64_t incx, real_t* y, std::int64_t incy, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_axpy, queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event axpy(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { throw unimplemented("blas", "axpy", " for USM"); } -sycl::event axpby(sycl::queue &queue, std::int64_t n, real_t alpha, const real_t *x, - std::int64_t incx, const real_t beta, real_t *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, std::int64_t n, real_t alpha, const real_t* x, + std::int64_t incx, const real_t beta, real_t* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", " for USM"); } -sycl::event axpby(sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, 
std::int64_t incx, const std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", " for USM"); } -sycl::event copy(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx, real_t *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event copy(sycl::queue& queue, std::int64_t n, const real_t* x, std::int64_t incx, real_t* y, + std::int64_t incy, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_copy, queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "copy", " for USM"); } -sycl::event dot(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx, - const real_t *y, std::int64_t incy, real_t *result, - const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, std::int64_t n, const real_t* x, std::int64_t incx, + const real_t* y, std::int64_t incy, real_t* result, + const std::vector& dependencies) { // portBLAS dot implementation requires result to be initializes to zero // before starting the computation. 
auto init_res_val = queue.submit( - [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); + [&](sycl::handler& cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); std::vector new_dependencies = dependencies; new_dependencies.emplace_back(init_res_val); CALL_PORTBLAS_USM_FN(::blas::_dot, queue, n, x, incx, y, incy, result, new_dependencies); } #ifdef ENABLE_MIXED_PRECISION_WITH_DOUBLE -sycl::event dot(sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { throw unimplemented("blas", "dot", " for USM"); } #endif -sycl::event sdsdot(sycl::queue &queue, std::int64_t n, real_t sb, const real_t *x, - std::int64_t incx, const real_t *y, std::int64_t incy, real_t *result, - const std::vector &dependencies) { +sycl::event sdsdot(sycl::queue& queue, std::int64_t n, real_t sb, const real_t* x, + std::int64_t incx, const real_t* y, std::int64_t incy, real_t* result, + const std::vector& dependencies) { // portBLAS sdsdot implementation requires result to be initializes to zero // before starting the computation. 
auto init_res_val = queue.submit( - [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); + [&](sycl::handler& cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); std::vector new_dependencies = dependencies; new_dependencies.emplace_back(init_res_val); CALL_PORTBLAS_USM_FN(::blas::_sdsdot, queue, n, sb, x, incx, y, incy, result, new_dependencies); } -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const std::complex *x, - std::int64_t incx, real_t *result, const std::vector &dependencies) { +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const std::complex* x, + std::int64_t incx, real_t* result, const std::vector& dependencies) { throw unimplemented("blas", "nrm2", " for USM"); } -sycl::event nrm2(sycl::queue &queue, std::int64_t n, const real_t *x, std::int64_t incx, - real_t *result, const std::vector &dependencies) { +sycl::event nrm2(sycl::queue& queue, std::int64_t n, const real_t* x, std::int64_t incx, + real_t* result, const std::vector& dependencies) { // portBLAS nrm2 implementation requires result to be initializes to zero // before starting the computation. 
auto init_res_val = queue.submit( - [&](sycl::handler &cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); + [&](sycl::handler& cgh) { cgh.single_task([=]() { result[0] = real_t(0); }); }); std::vector new_dependencies = dependencies; new_dependencies.push_back(init_res_val); CALL_PORTBLAS_USM_FN(::blas::_nrm2, queue, n, x, incx, result, new_dependencies); } -sycl::event rot(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, real_t c, real_t s, - const std::vector &dependencies) { +sycl::event rot(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, real_t c, real_t s, + const std::vector& dependencies) { throw unimplemented("blas", "rot", " for USM"); } -sycl::event rot(sycl::queue &queue, std::int64_t n, real_t *x, std::int64_t incx, real_t *y, +sycl::event rot(sycl::queue& queue, std::int64_t n, real_t* x, std::int64_t incx, real_t* y, std::int64_t incy, real_t c, real_t s, - const std::vector &dependencies) { + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_rot, queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rotg(sycl::queue &queue, real_t *a, real_t *b, real_t *c, real_t *s, - const std::vector &dependencies) { +sycl::event rotg(sycl::queue& queue, real_t* a, real_t* b, real_t* c, real_t* s, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_rotg, queue, a, b, c, s, dependencies); } -sycl::event rotg(sycl::queue &queue, std::complex *a, std::complex *b, real_t *c, - std::complex *s, const std::vector &dependencies) { +sycl::event rotg(sycl::queue& queue, std::complex* a, std::complex* b, real_t* c, + std::complex* s, const std::vector& dependencies) { throw unimplemented("blas", "rotg", " for USM"); } -sycl::event rotm(sycl::queue &queue, std::int64_t n, real_t *x, std::int64_t incx, real_t *y, - std::int64_t incy, real_t *param, const std::vector &dependencies) { +sycl::event 
rotm(sycl::queue& queue, std::int64_t n, real_t* x, std::int64_t incx, real_t* y, + std::int64_t incy, real_t* param, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_rotm, queue, n, x, incx, y, incy, param, dependencies); } -sycl::event rotmg(sycl::queue &queue, real_t *d1, real_t *d2, real_t *x1, real_t y1, real_t *param, - const std::vector &dependencies) { +sycl::event rotmg(sycl::queue& queue, real_t* d1, real_t* d2, real_t* x1, real_t y1, real_t* param, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_rotmg, queue, d1, d2, x1, y1, param, dependencies); } -sycl::event scal(sycl::queue &queue, std::int64_t n, real_t alpha, real_t *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, real_t alpha, real_t* x, std::int64_t incx, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_scal, queue, n, alpha, x, incx, dependencies); } -sycl::event scal(sycl::queue &queue, std::int64_t n, std::complex alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, std::complex alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "scal", " for USM"); } -sycl::event scal(sycl::queue &queue, std::int64_t n, real_t alpha, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event scal(sycl::queue& queue, std::int64_t n, real_t alpha, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { throw unimplemented("blas", "scal", " for USM"); } -sycl::event swap(sycl::queue &queue, std::int64_t n, real_t *x, std::int64_t incx, real_t *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(sycl::queue& queue, std::int64_t n, real_t* x, std::int64_t incx, real_t* y, + std::int64_t incy, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_swap, queue, n, x, incx, y, incy, 
dependencies); } -sycl::event swap(sycl::queue &queue, std::int64_t n, std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(sycl::queue& queue, std::int64_t n, std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "swap", " for USM"); } diff --git a/src/blas/backends/portblas/portblas_level2.cxx b/src/blas/backends/portblas/portblas_level2.cxx index b3d8b6766..a99077a51 100644 --- a/src/blas/backends/portblas/portblas_level2.cxx +++ b/src/blas/backends/portblas/portblas_level2.cxx @@ -19,452 +19,452 @@ // Buffer APIs -void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - real_t alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, real_t beta, sycl::buffer &y, std::int64_t incy) { +void gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + real_t alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, real_t beta, sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_gemv, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "gemv", " for complex"); } -void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, real_t alpha, sycl::buffer &a, - std::int64_t lda, 
sycl::buffer &x, std::int64_t incx, real_t beta, - sycl::buffer &y, std::int64_t incy) { +void gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, real_t alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, real_t beta, + sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_gbmv, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, +void gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "gbmv", " for complex"); } -void ger(sycl::queue &queue, std::int64_t m, std::int64_t n, real_t alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void ger(sycl::queue& queue, std::int64_t m, std::int64_t n, real_t alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { CALL_PORTBLAS_FN(::blas::_ger, queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { throw unimplemented("blas", "gerc", ""); } -void geru(sycl::queue &queue, 
std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { throw unimplemented("blas", "geru", ""); } -void hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "hbmv", ""); } -void hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hemv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "hemv", ""); } -void her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +void her(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { throw unimplemented("blas", "her", ""); } -void her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - 
std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void her2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { throw unimplemented("blas", "her2", ""); } -void hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { throw unimplemented("blas", "hpmv", ""); } -void hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +void hpr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { throw unimplemented("blas", "hpr", ""); } -void hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { +void hpr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { throw unimplemented("blas", "hpr2", ""); } -void sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - real_t alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, real_t beta, sycl::buffer &y, std::int64_t incy) { +void sbmv(sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, + real_t alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, real_t beta, sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_sbmv, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void symv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, real_t beta, sycl::buffer &y, std::int64_t incy) { +void symv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, real_t beta, sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_symv, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a, +void syr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { CALL_PORTBLAS_FN(::blas::_syr, queue, upper_lower, n, alpha, x, incx, a, lda); } -void syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void syr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { CALL_PORTBLAS_FN(::blas::_syr2, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer &a, sycl::buffer &x, std::int64_t incx, real_t beta, - sycl::buffer &y, std::int64_t incy) { +void spmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, 
std::int64_t n, real_t alpha, + sycl::buffer& a, sycl::buffer& x, std::int64_t incx, real_t beta, + sycl::buffer& y, std::int64_t incy) { CALL_PORTBLAS_FN(::blas::_spmv, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { +void spr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { CALL_PORTBLAS_FN(::blas::_spr, queue, upper_lower, n, alpha, x, incx, a); } -void spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { +void spr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { CALL_PORTBLAS_FN(::blas::_spr2, queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { CALL_PORTBLAS_FN(::blas::_tbmv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +void tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& a, std::int64_t lda, + 
sycl::buffer, 1>& x, std::int64_t incx) { throw unimplemented("blas", "tbmv", ""); } -void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { CALL_PORTBLAS_FN(::blas::_tbsv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +void tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { throw unimplemented("blas", "tbsv", ""); } -void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { +void tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx) { CALL_PORTBLAS_FN(::blas::_tpmv, queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { throw 
unimplemented("blas", "tpmv", ""); } -void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx) { +void tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx) { CALL_PORTBLAS_FN(::blas::_tpsv, queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { throw unimplemented("blas", "tpsv", ""); } -void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { CALL_PORTBLAS_FN(::blas::_trmv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { throw unimplemented("blas", "trmv", " for complex"); } -void 
trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { CALL_PORTBLAS_FN(::blas::_trsv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { throw unimplemented("blas", "trsv", ""); } // USM APIs -sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - real_t alpha, const real_t *a, std::int64_t lda, const real_t *x, - std::int64_t incx, real_t beta, real_t *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + real_t alpha, const real_t* a, std::int64_t lda, const real_t* x, + std::int64_t incx, real_t beta, real_t* y, std::int64_t incy, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_gemv, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(sycl::queue& 
queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "gemv", " for USM"); } -sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, real_t alpha, const real_t *a, std::int64_t lda, - const real_t *x, std::int64_t incx, real_t beta, real_t *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, real_t alpha, const real_t* a, std::int64_t lda, + const real_t* x, std::int64_t incx, real_t beta, real_t* y, std::int64_t incy, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_gbmv, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, +sycl::event gbmv(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { throw unimplemented("blas", "gbmv", " for USM"); } -sycl::event ger(sycl::queue &queue, std::int64_t m, std::int64_t n, real_t alpha, const real_t *x, - std::int64_t incx, const real_t *y, std::int64_t incy, real_t *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event ger(sycl::queue& queue, std::int64_t m, std::int64_t 
n, real_t alpha, const real_t* x, + std::int64_t incx, const real_t* y, std::int64_t incy, real_t* a, std::int64_t lda, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_ger, queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event gerc(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event gerc(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { throw unimplemented("blas", "gerc", " for USM"); } -sycl::event geru(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event geru(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, + const std::vector& dependencies) { throw unimplemented("blas", "geru", " for USM"); } -sycl::event hbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "hbmv", " 
for USM"); } -sycl::event hemv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hemv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "hemv", " for USM"); } -sycl::event her(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { throw unimplemented("blas", "her", " for USM"); } -sycl::event her2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { throw unimplemented("blas", "her2", " for USM"); } -sycl::event hpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hpmv(sycl::queue& 
queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "hpmv", " for USM"); } -sycl::event hpr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - const std::complex *x, std::int64_t incx, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const std::complex* x, std::int64_t incx, std::complex* a, + const std::vector& dependencies) { throw unimplemented("blas", "hpr", " for USM"); } -sycl::event hpr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { throw unimplemented("blas", "hpr2", " for USM"); } -sycl::event sbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - real_t alpha, const real_t *a, std::int64_t lda, const real_t *x, - std::int64_t incx, real_t beta, real_t *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event sbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, + real_t alpha, const real_t* a, std::int64_t lda, const real_t* x, + std::int64_t incx, real_t beta, real_t* y, std::int64_t incy, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_sbmv, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event symv(sycl::queue &queue, oneapi::mkl::uplo 
upper_lower, std::int64_t n, real_t alpha, - const real_t *a, std::int64_t lda, const real_t *x, std::int64_t incx, real_t beta, - real_t *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event symv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const real_t* a, std::int64_t lda, const real_t* x, std::int64_t incx, real_t beta, + real_t* y, std::int64_t incy, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_symv, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event syr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - const real_t *x, std::int64_t incx, real_t *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const real_t* x, std::int64_t incx, real_t* a, std::int64_t lda, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_syr, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event syr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - const real_t *x, std::int64_t incx, const real_t *y, std::int64_t incy, real_t *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event syr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const real_t* x, std::int64_t incx, const real_t* y, std::int64_t incy, real_t* a, + std::int64_t lda, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_syr2, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event spmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - const real_t *a, const real_t *x, std::int64_t incx, real_t beta, real_t *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event spmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const 
real_t* a, const real_t* x, std::int64_t incx, real_t beta, real_t* y, + std::int64_t incy, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_spmv, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event spr(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - const real_t *x, std::int64_t incx, real_t *a, - const std::vector &dependencies) { +sycl::event spr(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const real_t* x, std::int64_t incx, real_t* a, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_spr, queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event spr2(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, - const real_t *x, std::int64_t incx, const real_t *y, std::int64_t incy, real_t *a, - const std::vector &dependencies) { +sycl::event spr2(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, real_t alpha, + const real_t* x, std::int64_t incx, const real_t* y, std::int64_t incy, real_t* a, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_spr2, queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, const real_t *a, - std::int64_t lda, real_t *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, const real_t* a, + std::int64_t lda, real_t* x, std::int64_t incx, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_tbmv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, 
+sycl::event tbmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { throw unimplemented("blas", "tbmv", " for USM"); } -sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, const real_t *a, - std::int64_t lda, real_t *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, const real_t* a, + std::int64_t lda, real_t* x, std::int64_t incx, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_tbsv, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +sycl::event tbsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - const std::complex *a, std::int64_t lda, std::complex *x, - std::int64_t incx, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* x, + std::int64_t incx, const std::vector& dependencies) { throw unimplemented("blas", "tbsv", " for USM"); } -sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, const real_t *a, real_t *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, 
std::int64_t n, const real_t* a, real_t* x, + std::int64_t incx, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_tpmv, queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "tpmv", " for USM"); } -sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, const real_t *a, real_t *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, const real_t* a, real_t* x, + std::int64_t incx, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_tpsv, queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "tpsv", " for USM"); } -sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, 
const real_t *a, std::int64_t lda, - real_t *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, const real_t* a, std::int64_t lda, + real_t* x, std::int64_t incx, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_trmv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trmv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "trmv", " for USM"); } -sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, const real_t *a, std::int64_t lda, - real_t *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + oneapi::mkl::diag unit_diag, std::int64_t n, const real_t* a, std::int64_t lda, + real_t* x, std::int64_t incx, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_trsv, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trsv(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose 
trans, + oneapi::mkl::diag unit_diag, std::int64_t n, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { throw unimplemented("blas", "trsv", " for USM"); } diff --git a/src/blas/backends/portblas/portblas_level3.cxx b/src/blas/backends/portblas/portblas_level3.cxx index 4eeb1e8f1..57c6f25b1 100644 --- a/src/blas/backends/portblas/portblas_level3.cxx +++ b/src/blas/backends/portblas/portblas_level3.cxx @@ -19,19 +19,19 @@ // Buffer APIs -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, real_t beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, real_t beta, + sycl::buffer& c, std::int64_t ldc) { CALL_PORTBLAS_FN(::blas::_gemm, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { using sycl_complex_real_t = sycl::ext::oneapi::experimental::complex; if (transa == oneapi::mkl::transpose::conjtrans || transb == oneapi::mkl::transpose::conjtrans) { @@ -63,184 +63,184 @@ void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transp 
queue.copy(out_pb_acc, out_acc); } -void symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - std::int64_t m, std::int64_t n, real_t alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, real_t beta, - sycl::buffer &c, std::int64_t ldc) { +void symm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + std::int64_t m, std::int64_t n, real_t alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, real_t beta, + sycl::buffer& c, std::int64_t ldc) { CALL_PORTBLAS_FN(::blas::_symm, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void symm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "symm", ""); } -void hemm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void hemm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "hemm", ""); } -void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer &a, - std::int64_t lda, real_t beta, 
sycl::buffer &c, std::int64_t ldc) { +void syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer& a, + std::int64_t lda, real_t beta, sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "syrk", ""); } -void syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +void syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "syrk", ""); } -void herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer, 1> &a, - std::int64_t lda, real_t beta, sycl::buffer, 1> &c, +void herk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer, 1>& a, + std::int64_t lda, real_t beta, sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "herk", ""); } -void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, real_t beta, - sycl::buffer &c, std::int64_t ldc) { +void syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, real_t alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, real_t beta, + sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "syr2k", ""); } -void syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +void syr2k(sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "syr2k", ""); } -void her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +void her2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, real_t beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, real_t beta, + sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "her2k", ""); } -void trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - real_t alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + real_t alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { throw unimplemented("blas", "trmm", ""); } -void trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { throw unimplemented("blas", "trmm", 
""); } -void trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - real_t alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + real_t alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { CALL_PORTBLAS_FN(::blas::_trsm, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +void trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { throw unimplemented("blas", "trsm", " for complex"); } -void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, +void gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, real_t alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, real_t beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, real_t beta, sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "gemmt", ""); } -void gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, +void gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, 
std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "gemmt", ""); } -void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, +void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { CALL_PORTBLAS_FN(::blas::_omatcopy, queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { +void omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { throw unimplemented("blas", "omatcopy", ""); } -void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &b, std::int64_t ldb, std::int64_t strideb) { +void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha, + sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { CALL_PORTBLAS_FN(::blas::_omatcopy2, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, +void omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, 
sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } -void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha, - sycl::buffer &ab, std::int64_t lda, std::int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, real_t alpha, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { throw unimplemented("blas", "imatcopy", ""); } -void imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { throw unimplemented("blas", "imatcopy", ""); } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - real_t alpha, sycl::buffer &a, std::int64_t lda, real_t beta, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &c, +void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + real_t alpha, sycl::buffer& a, std::int64_t lda, real_t beta, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& c, std::int64_t ldc) { CALL_PORTBLAS_FN(::blas::_omatadd, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::complex beta, sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { +void omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::complex beta, sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { throw unimplemented("blas", "omatadd", ""); } // USM APIs -sycl::event gemm(sycl::queue 
&queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, real_t alpha, const real_t *a, - std::int64_t lda, const real_t *b, std::int64_t ldb, real_t beta, real_t *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, real_t alpha, const real_t* a, + std::int64_t lda, const real_t* b, std::int64_t ldb, real_t beta, real_t* c, + std::int64_t ldc, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_gemm, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { if (transa == oneapi::mkl::transpose::conjtrans || transb == oneapi::mkl::transpose::conjtrans) { throw unimplemented("blas", "gemm", "Conjugate Transpose unsupported yet on portBLAS"); @@ -249,203 +249,203 @@ sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl: c, ldc, dependencies); } -sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - std::int64_t m, std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda, - const real_t *b, std::int64_t ldb, real_t beta, real_t *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(sycl::queue& queue, 
oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + std::int64_t m, std::int64_t n, real_t alpha, const real_t* a, std::int64_t lda, + const real_t* b, std::int64_t ldb, real_t beta, real_t* c, std::int64_t ldc, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_symm, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +sycl::event symm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "symm", " for USM"); } -sycl::event hemm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +sycl::event hemm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "hemm", " for USM"); } -sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, real_t alpha, const real_t *a, std::int64_t lda, - real_t beta, real_t *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syrk(sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, real_t alpha, const real_t* a, std::int64_t lda, + real_t beta, real_t* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "syrk", " for USM"); } -sycl::event syrk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +sycl::event syrk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "syrk", " for USM"); } -sycl::event herk(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, real_t alpha, const std::complex *a, - std::int64_t lda, real_t beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event herk(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, real_t alpha, const std::complex* a, + std::int64_t lda, real_t beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "herk", " for USM"); } -sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, real_t alpha, const real_t *a, std::int64_t lda, - const real_t *b, std::int64_t ldb, real_t beta, real_t *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, real_t alpha, const real_t* a, std::int64_t lda, + const real_t* b, std::int64_t ldb, real_t beta, real_t* c, 
std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "syr2k", " for USM"); } -sycl::event syr2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +sycl::event syr2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "syr2k", " for USM"); } -sycl::event her2k(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, +sycl::event her2k(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, real_t beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, real_t beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "her2k", " for USM"); } -sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +sycl::event trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda, real_t *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, real_t alpha, const real_t* a, std::int64_t lda, real_t* b, + std::int64_t ldb, const std::vector& dependencies) { throw unimplemented("blas", "trmm", " for USM"); 
} -sycl::event trmm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +sycl::event trmm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "trmm", " for USM"); } -sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +sycl::event trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda, real_t *b, - std::int64_t ldb, const std::vector &dependencies) { + std::int64_t n, real_t alpha, const real_t* a, std::int64_t lda, real_t* b, + std::int64_t ldb, const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_trsm, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, +sycl::event trsm(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "trsm", " for USM"); } -sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo 
upper_lower, oneapi::mkl::transpose transa, +sycl::event gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, real_t alpha, - const real_t *a, std::int64_t lda, const real_t *b, std::int64_t ldb, real_t beta, - real_t *c, std::int64_t ldc, const std::vector &dependencies) { + const real_t* a, std::int64_t lda, const real_t* b, std::int64_t ldb, real_t beta, + real_t* c, std::int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "gemmt", " for USM"); } -sycl::event gemmt(sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, +sycl::event gemmt(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", " for USM"); } -sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - real_t alpha, const real_t *a, std::int64_t lda, real_t *b, std::int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + real_t alpha, const real_t* a, std::int64_t lda, real_t* b, std::int64_t ldb, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_omatcopy, queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - 
const std::vector &dependencies) { +sycl::event omatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy", "for USM"); } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - real_t alpha, const real_t *a, std::int64_t lda, std::int64_t stridea, - real_t *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + real_t alpha, const real_t* a, std::int64_t lda, std::int64_t stridea, + real_t* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_omatcopy2, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event omatcopy2(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, std::int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", "for USM"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - real_t alpha, real_t *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + real_t alpha, real_t* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", ""); } -sycl::event 
imatcopy(sycl::queue &queue, transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, std::int64_t lda, - std::int64_t ldb, const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, std::complex* ab, std::int64_t lda, + std::int64_t ldb, const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", ""); } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, real_t alpha, const real_t *a, std::int64_t lda, real_t beta, - const real_t *b, std::int64_t ldb, real_t *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, real_t alpha, const real_t* a, std::int64_t lda, real_t beta, + const real_t* b, std::int64_t ldb, real_t* c, std::int64_t ldc, + const std::vector& dependencies) { CALL_PORTBLAS_USM_FN(::blas::_omatadd, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, const std::complex *b, - std::int64_t ldb, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, const std::complex* b, + std::int64_t ldb, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "omatadd", ""); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - real_t *alpha, const real_t **a, int64_t *lda, real_t **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const 
std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + real_t* alpha, const real_t** a, int64_t* lda, real_t** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", ""); } -sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, const std::complex **a, - int64_t *lda, std::complex **b, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, const std::complex** a, + int64_t* lda, std::complex** b, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "omatcopy_batch", ""); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - real_t *alpha, real_t **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + real_t* alpha, real_t** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", ""); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* groupsize, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", ""); } diff --git 
a/src/blas/backends/portblas/portblas_level3_bfloat16.cpp b/src/blas/backends/portblas/portblas_level3_bfloat16.cpp index 1684b1b3e..cb5bac88f 100644 --- a/src/blas/backends/portblas/portblas_level3_bfloat16.cpp +++ b/src/blas/backends/portblas/portblas_level3_bfloat16.cpp @@ -33,20 +33,20 @@ namespace portblas { namespace column_major { // BUFFER -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "gemm", " for bfloat16"); } // USM -sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const oneapi::mkl::bfloat16 *a, std::int64_t lda, const oneapi::mkl::bfloat16 *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + const oneapi::mkl::bfloat16* a, std::int64_t lda, const oneapi::mkl::bfloat16* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", " for USM"); } @@ -54,20 +54,20 @@ sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl: namespace row_major { // BUFFER -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t 
lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "gemm", " for bfloat16"); } // USM -sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const oneapi::mkl::bfloat16 *a, std::int64_t lda, const oneapi::mkl::bfloat16 *b, - std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { + const oneapi::mkl::bfloat16* a, std::int64_t lda, const oneapi::mkl::bfloat16* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", " for USM"); } diff --git a/src/blas/backends/portblas/portblas_level3_half.cpp b/src/blas/backends/portblas/portblas_level3_half.cpp index 0e42528fa..136178998 100644 --- a/src/blas/backends/portblas/portblas_level3_half.cpp +++ b/src/blas/backends/portblas/portblas_level3_half.cpp @@ -33,66 +33,66 @@ namespace portblas { namespace column_major { // BUFFER -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "gemm", " half"); } -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm(sycl::queue& queue, 
oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "gemm", " for different argument data types"); } // USM -sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", " for USM"); } -sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "gemm", " for USM"); } } // namespace column_major namespace row_major { // BUFFER -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm(sycl::queue& queue, 
oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "gemm", " half"); } -void gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +void gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { throw unimplemented("blas", "gemm", " for different argument data types"); } // USM -sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, +sycl::event gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemm", " for USM"); } -sycl::event gemm(sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event 
gemm(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { throw unimplemented("blas", "gemm", " for USM"); } diff --git a/src/blas/backends/rocblas/rocblas_batch.cpp b/src/blas/backends/rocblas/rocblas_batch.cpp index 5fa103055..64c89017f 100644 --- a/src/blas/backends/rocblas/rocblas_batch.cpp +++ b/src/blas/backends/rocblas/rocblas_batch.cpp @@ -28,7 +28,7 @@ // Helper Functions template -static inline void conj_vector(sycl::handler &cgh, sycl::buffer &buf, const int64_t len, +static inline void conj_vector(sycl::handler& cgh, sycl::buffer& buf, const int64_t len, const int64_t inc, const int64_t stride, const int64_t batch_size) { const auto abs_inc = std::abs(inc); const auto abs_stride = std::abs(stride); @@ -40,7 +40,7 @@ static inline void conj_vector(sycl::handler &cgh, sycl::buffer &buf, const i }); } template -static inline void conj_vector(sycl::handler &cgh, T *ptr, const int64_t len, const int64_t inc, +static inline void conj_vector(sycl::handler& cgh, T* ptr, const int64_t len, const int64_t inc, const int64_t stride, const int64_t batch_size) { const auto abs_inc = std::abs(inc); const auto abs_stride = std::abs(stride); @@ -52,7 +52,7 @@ static inline void conj_vector(sycl::handler &cgh, T *ptr, const int64_t len, co } template -static inline void conj_vector(sycl::handler &cgh, T **ptr, const int64_t len, const int64_t inc, +static inline void conj_vector(sycl::handler& cgh, T** ptr, const int64_t len, const int64_t inc, const int64_t stride, const int64_t group_size) { const auto abs_inc = std::abs(inc); cgh.parallel_for(sycl::range{ (std::size_t)group_size, (std::size_t)len }, @@ -72,20 +72,20 @@ namespace column_major { // Buffer APIs template -inline void copy_batch(Func func, sycl::queue &queue, 
int64_t n, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +inline void copy_batch(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy, stridex, stridey, batch_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, stridex, y_, incy, stridey, batch_size); @@ -94,8 +94,8 @@ inline void copy_batch(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, \ + void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, \ int64_t batch_size) { \ copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey, batch_size); \ } @@ -108,30 +108,30 @@ COPY_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zcopy_strided_batched) #undef COPY_STRIDED_BATCH_LAUNCHER template -inline void axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +inline void axpy_batch(Func func, sycl::queue& queue, int64_t n, T alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { using rocDataType = typename RocEquivalentType::Type; 
overflow_check(n, incx, incy, stridex, stridey, batch_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, stridex, + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType*)&alpha, x_, incx, stridex, y_, incy, stridey, batch_size); }); }); } #define AXPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer &x, \ - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, \ + void axpy_batch(sycl::queue& queue, int64_t n, TYPE alpha, sycl::buffer& x, \ + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, \ int64_t stridey, int64_t batch_size) { \ axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, incy, stridey, \ batch_size); \ @@ -145,36 +145,36 @@ AXPY_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zaxpy_strided_batched) #undef AXPY_BATCH_LAUNCHER template -inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - T alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &x, int64_t incx, int64_t stridex, T beta, - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { +inline void gemv_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + T alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& x, int64_t incx, int64_t stridex, T beta, + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { using rocDataType = typename 
RocEquivalentType::Type; overflow_check(m, n, lda, incx, incy, stridea, stridex, stridey, batch_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, - (rocDataType *)&alpha, a_, lda, stridea, x_, incx, stridex, - (rocDataType *)&beta, y_, incy, stridey, batch_size); + (rocDataType*)&alpha, a_, lda, stridea, x_, incx, stridex, + (rocDataType*)&beta, y_, incy, stridey, batch_size); }); }); } #define GEMV_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &x, int64_t incx, int64_t stridex, TYPE beta, \ - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { \ + void gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& x, int64_t incx, int64_t stridex, TYPE beta, \ + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { \ gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, \ beta, y, incy, stridey, batch_size); \ } @@ -187,23 +187,23 @@ GEMV_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zgemv_strided_batched) #undef GEMV_STRIDED_BATCH_LAUNCHER template -inline void dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t 
m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stridea, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &c, int64_t ldc, +inline void dgmm_batch(Func func, sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stridea, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldc, incx, stridea, stridex, stridec, batch_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), m, n, a_, lda, stridea, x_, incx, stridex, c_, ldc, stridec, batch_size); @@ -212,10 +212,10 @@ inline void dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m } #define DGMM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, \ - sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &x, int64_t incx, int64_t stridex, \ - sycl::buffer &c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ + void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& x, int64_t incx, int64_t stridex, \ + sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ dgmm_batch(ROCBLAS_ROUTINE, 
queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, \ ldc, stridec, batch_size); \ } @@ -228,10 +228,10 @@ DGMM_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zdgmm_strided_batched) #undef DGMM_STRIDED_BATCH_LAUNCHER template -inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, Ts alpha, sycl::buffer &a, int64_t lda, - int64_t stridea, sycl::buffer &b, int64_t ldb, int64_t strideb, - Ts beta, sycl::buffer &c, int64_t ldc, int64_t stridec, +inline void gemm_batch_impl(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, Ts alpha, sycl::buffer& a, int64_t lda, + int64_t stridea, sycl::buffer& b, int64_t ldb, int64_t strideb, + Ts beta, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { using rocTypeA = typename RocEquivalentType::Type; using rocTypeB = typename RocEquivalentType::Type; @@ -241,16 +241,16 @@ inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose tran int32_t solution_index = 0; rocblas_gemm_flags flags = rocblas_gemm_flags_none; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(rocblas_gemm_strided_batched_ex, err, handle, @@ -266,10 +266,10 @@ inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose tran } #define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, 
transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stridea, sycl::buffer &b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stridec, \ + void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + int64_t stridea, sycl::buffer& b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, sycl::buffer& c, int64_t ldc, int64_t stridec, \ int64_t batch_size) { \ gemm_batch_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, \ beta, c, ldc, stridec, batch_size); \ @@ -287,10 +287,10 @@ GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float) #undef GEMM_STRIDED_BATCH_LAUNCHER #define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stridea, sycl::buffer &b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stridec, \ + void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + int64_t stridea, sycl::buffer& b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, sycl::buffer& c, int64_t ldc, int64_t stridec, \ int64_t batch_size) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ @@ -304,35 +304,35 @@ GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER template -inline void trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower, +inline void trsm_batch(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - sycl::buffer &a, 
int64_t lda, int64_t stridea, sycl::buffer &b, + sycl::buffer& a, int64_t lda, int64_t stridea, sycl::buffer& b, int64_t ldb, int64_t strideb, int64_t batch_size) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, stridea, strideb, batch_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), - m, n, (rocDataType *)&alpha, a_, lda, stridea, b_, ldb, strideb, + m, n, (rocDataType*)&alpha, a_, lda, stridea, b_, ldb, strideb, batch_size); }); }); } #define TRSM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, int64_t stridea, sycl::buffer &b, int64_t ldb, \ + void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, int64_t stridea, sycl::buffer& b, int64_t ldb, \ int64_t strideb, int64_t batch_size) { \ trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \ a, lda, stridea, b, ldb, strideb, batch_size); \ @@ -346,34 +346,34 @@ TRSM_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_ztrsm_strided_batched) #undef TRSM_STRIDED_BATCH_LAUNCHER template -inline void syrk_batch(Func func, sycl::queue 
&queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - T beta, sycl::buffer &c, int64_t ldc, int64_t stridec, +inline void syrk_batch(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + T beta, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldc, stridea, stridec, batch_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, stridea, (rocDataType *)&beta, c_, ldc, stridec, + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, stridea, (rocDataType*)&beta, c_, ldc, stridec, batch_size); }); }); } #define SYRK_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, int64_t stridea, TYPE beta, \ - sycl::buffer &c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ + void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, int64_t stridea, TYPE beta, \ + sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, 
alpha, a, lda, stridea, beta, \ c, ldc, stridec, batch_size); \ } @@ -386,9 +386,9 @@ SYRK_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zsyrk_strided_batched) #undef SYRK_STRIDED_BATCH_LAUNCHER template -inline void omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - const T alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &b, int64_t ldb, int64_t strideb, +inline void omatcopy_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + const T alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& b, int64_t ldb, int64_t strideb, int64_t batch_size) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, stridea, strideb, batch_size); @@ -397,27 +397,27 @@ inline void omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64 const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n; const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? 
n : m; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), get_rocblas_operation(trans), new_m, new_n, - (rocDataType *)&alpha, a_, lda, stridea, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, stridea, (rocDataType*)&beta, nullptr, lda, stridea, b_, ldb, strideb, batch_size); }); }); } #define OMATCOPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ - const TYPE alpha, sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &b, int64_t ldb, int64_t strideb, \ + void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, \ + const TYPE alpha, sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& b, int64_t ldb, int64_t strideb, \ int64_t batch_size) { \ omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, \ strideb, batch_size); \ @@ -430,63 +430,63 @@ OMATCOPY_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zgeam_strided_batc #undef OMATCOPY_STRIDED_BATCH_LAUNCHER -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue 
&queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } template -inline void omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, const T alpha, sycl::buffer &a, int64_t lda, - int64_t stridea, const T beta, sycl::buffer &b, int64_t ldb, - int64_t strideb, sycl::buffer &c, int64_t ldc, int64_t stridec, +inline void omatadd_batch(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, const T alpha, sycl::buffer& a, int64_t lda, + int64_t stridea, const T beta, sycl::buffer& b, int64_t ldb, + int64_t strideb, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc, stridea, strideb, stridec, batch_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = 
a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_, - lda, stridea, (rocDataType *)&beta, b_, ldb, strideb, c_, ldc, + get_rocblas_operation(transb), m, n, (rocDataType*)&alpha, a_, + lda, stridea, (rocDataType*)&beta, b_, ldb, strideb, c_, ldc, stridec, batch_size); }); }); } #define OMATADD_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, const TYPE alpha, sycl::buffer &a, int64_t lda, \ - int64_t stridea, const TYPE beta, sycl::buffer &b, int64_t ldb, \ - int64_t strideb, sycl::buffer &c, int64_t ldc, int64_t stridec, \ + void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, const TYPE alpha, sycl::buffer& a, int64_t lda, \ + int64_t stridea, const TYPE beta, sycl::buffer& b, int64_t ldb, \ + int64_t strideb, sycl::buffer& c, int64_t ldc, int64_t stridec, \ int64_t batch_size) { \ omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, beta, \ b, ldb, strideb, c, ldc, stridec, batch_size); \ @@ -502,24 +502,24 @@ OMATADD_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zgeam_strided_batch // USM APIs template -inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t *n, const T **x, int64_t *incx, - T **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) 
{ +inline sycl::event copy_batch(Func func, sycl::queue& queue, int64_t* n, const T** x, int64_t* incx, + T** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(n[i], incx[i], incy[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **x_ = reinterpret_cast(x); - auto **y_ = reinterpret_cast(y); + auto** x_ = reinterpret_cast(x); + auto** y_ = reinterpret_cast(y); ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, (int)n[i], x_ + offset, (int)incx[i], y_ + offset, (int)incy[i], (int)group_size[i]); offset += group_size[i]; @@ -531,9 +531,9 @@ inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t *n, const T } #define COPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event copy_batch(sycl::queue &queue, int64_t *n, const TYPE **x, int64_t *incx, \ - TYPE **y, int64_t *incy, int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event copy_batch(sycl::queue& queue, int64_t* n, const TYPE** x, int64_t* incx, \ + TYPE** y, int64_t* incy, int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, group_count, group_size, \ dependencies); \ } @@ -546,19 +546,19 @@ COPY_BATCH_LAUNCHER_USM(std::complex, rocblas_zcopy_batched) #undef COPY_BATCH_LAUNCHER_USM template -inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx, - int64_t stridex, T *y, int64_t incy, 
int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event copy_batch(Func func, sycl::queue& queue, int64_t n, const T* x, int64_t incx, + int64_t stridex, T* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy, stridex, stridey, batch_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, stridex, y_, incy, stridey, batch_size); @@ -569,9 +569,9 @@ inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t n, const T } #define COPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event copy_batch(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, \ - int64_t stridex, TYPE *y, int64_t incy, int64_t stridey, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event copy_batch(sycl::queue& queue, int64_t n, const TYPE* x, int64_t incx, \ + int64_t stridex, TYPE* y, int64_t incy, int64_t stridey, \ + int64_t batch_size, const std::vector& dependencies) { \ return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey, \ batch_size, dependencies); \ } @@ -584,25 +584,25 @@ COPY_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zcopy_strided_batc #undef COPY_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t *n, T *alpha, const T **x, - int64_t *incx, T **y, int64_t *incy, int64_t group_count, - int64_t *group_size, 
const std::vector &dependencies) { +inline sycl::event axpy_batch(Func func, sycl::queue& queue, int64_t* n, T* alpha, const T** x, + int64_t* incx, T** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(n[i], incx[i], incy[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **x_ = reinterpret_cast(x); - auto **y_ = reinterpret_cast(y); - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, (int)n[i], (rocDataType *)&alpha[i], + auto** x_ = reinterpret_cast(x); + auto** y_ = reinterpret_cast(y); + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, (int)n[i], (rocDataType*)&alpha[i], x_ + offset, (int)incx[i], y_ + offset, (int)incy[i], (int)group_size[i]); offset += group_size[i]; @@ -614,9 +614,9 @@ inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t *n, T *alph } #define AXPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event axpy_batch(sycl::queue &queue, int64_t *n, TYPE *alpha, const TYPE **x, \ - int64_t *incx, TYPE **y, int64_t *incy, int64_t group_count, \ - int64_t *group_size, const std::vector &dependencies) { \ + sycl::event axpy_batch(sycl::queue& queue, int64_t* n, TYPE* alpha, const TYPE** x, \ + int64_t* incx, TYPE** y, int64_t* incy, int64_t group_count, \ + int64_t* group_size, const std::vector& dependencies) { \ return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, group_count, \ group_size, dependencies); \ } @@ -629,21 +629,21 @@ AXPY_BATCH_LAUNCHER_USM(std::complex, 
rocblas_zaxpy_batched) #undef AXPY_BATCH_LAUNCHER_USM template -inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x, - int64_t incx, int64_t stridex, T *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event axpy_batch(Func func, sycl::queue& queue, int64_t n, T alpha, const T* x, + int64_t incx, int64_t stridex, T* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy, stridex, stridey, batch_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, stridex, + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType*)&alpha, x_, incx, stridex, y_, incy, stridey, batch_size); }); }); @@ -652,9 +652,9 @@ inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, } #define AXPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \ - int64_t stridex, TYPE *y, int64_t incy, int64_t stridey, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event axpy_batch(sycl::queue& queue, int64_t n, TYPE alpha, const TYPE* x, int64_t incx, \ + int64_t stridex, TYPE* y, int64_t incy, int64_t stridey, \ + int64_t batch_size, const std::vector& dependencies) { \ return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, 
incy, stridey, \ batch_size, dependencies); \ } @@ -667,26 +667,26 @@ AXPY_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zaxpy_strided_batc #undef AXPY_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, int64_t stridea, const T *x, - int64_t incx, int64_t stridex, T beta, T *y, int64_t incy, +inline sycl::event gemv_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + T alpha, const T* a, int64_t lda, int64_t stridea, const T* x, + int64_t incx, int64_t stridex, T beta, T* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, m, lda, incx, incy, stridea, stridex, stridey, batch_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, - (rocDataType *)&alpha, a_, lda, stridea, x_, incx, stridex, - (rocDataType *)&beta, y_, incy, stridey, batch_size); + (rocDataType*)&alpha, a_, lda, stridea, x_, incx, stridex, + (rocDataType*)&beta, y_, incy, stridey, batch_size); }); }); @@ -694,11 +694,11 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, in } #define GEMV_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, 
int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x, \ - int64_t incx, int64_t stridex, TYPE beta, TYPE *y, int64_t incy, \ + sycl::event gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, int64_t stridea, const TYPE* x, \ + int64_t incx, int64_t stridex, TYPE beta, TYPE* y, int64_t incy, \ int64_t stridey, int64_t batch_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx, \ stridex, beta, y, incy, stridey, batch_size, dependencies); \ } @@ -711,30 +711,30 @@ GEMV_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zgemv_strided_batc #undef GEMV_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m, - int64_t *n, T *alpha, const T **a, int64_t *lda, const T **x, - int64_t *incx, T *beta, T **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +inline sycl::event gemv_batch(Func func, sycl::queue& queue, transpose* trans, int64_t* m, + int64_t* n, T* alpha, const T** a, int64_t* lda, const T** x, + int64_t* incx, T* beta, T** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(m[i], n[i], lda[i], incx[i], incy[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a); - auto **x_ = 
reinterpret_cast(x); - auto **y_ = reinterpret_cast(y); + auto** a_ = reinterpret_cast(a); + auto** x_ = reinterpret_cast(x); + auto** y_ = reinterpret_cast(y); ROCBLAS_ERROR_FUNC_SYNC( func, err, handle, get_rocblas_operation(trans[i]), (int)m[i], (int)n[i], - (rocDataType *)&alpha[i], a_ + offset, (int)lda[i], x_ + offset, (int)incx[i], - (rocDataType *)&beta[i], y_ + offset, (int)incy[i], (int)group_size[i]); + (rocDataType*)&alpha[i], a_ + offset, (int)lda[i], x_ + offset, (int)incx[i], + (rocDataType*)&beta[i], y_ + offset, (int)incy[i], (int)group_size[i]); offset += group_size[i]; } }); @@ -745,9 +745,9 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, i #define GEMV_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ sycl::event gemv_batch( \ - sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, TYPE *alpha, const TYPE **a, \ - int64_t *lda, const TYPE **x, int64_t *incx, TYPE *beta, TYPE **y, int64_t *incy, \ - int64_t group_count, int64_t *group_size, const std::vector &dependencies) { \ + sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, TYPE* alpha, const TYPE** a, \ + int64_t* lda, const TYPE** x, int64_t* incx, TYPE* beta, TYPE** y, int64_t* incy, \ + int64_t group_count, int64_t* group_size, const std::vector& dependencies) { \ return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, \ incy, group_count, group_size, dependencies); \ } @@ -760,21 +760,21 @@ GEMV_BATCH_LAUNCHER_USM(std::complex, rocblas_zgemv_batched) #undef GEMV_BATCH_LAUNCHER_USM template -inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m, int64_t n, - const T *a, int64_t lda, int64_t stridea, const T *x, int64_t incx, - int64_t stridex, T *c, int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event dgmm_batch(Func func, sycl::queue& queue, side left_right, int64_t m, int64_t n, + const T* a, int64_t lda, int64_t stridea, 
const T* x, int64_t incx, + int64_t stridex, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, incx, stridea, stridex, stridec, batch_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), m, n, a_, lda, stridea, x_, incx, stridex, c_, ldc, stridec, batch_size); @@ -785,10 +785,10 @@ inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side left_right, in } #define DGMM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, \ - const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x, \ - int64_t incx, int64_t stridex, TYPE *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, \ + const TYPE* a, int64_t lda, int64_t stridea, const TYPE* x, \ + int64_t incx, int64_t stridex, TYPE* c, int64_t ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, stridea, x, incx, \ stridex, c, ldc, stridec, batch_size, dependencies); \ } @@ -801,26 +801,26 @@ DGMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zdgmm_strided_batc #undef DGMM_STRIDED_BATCH_LAUNCHER_USM template -inline 
sycl::event dgmm_batch(Func func, sycl::queue &queue, side *left_right, int64_t *m, - int64_t *n, const T **a, int64_t *lda, const T **x, int64_t *incx, - T **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event dgmm_batch(Func func, sycl::queue& queue, side* left_right, int64_t* m, + int64_t* n, const T** a, int64_t* lda, const T** x, int64_t* incx, + T** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(m[i], n[i], lda[i], ldc[i], incx[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a); - auto **x_ = reinterpret_cast(x); - auto **c_ = reinterpret_cast(c); + auto** a_ = reinterpret_cast(a); + auto** x_ = reinterpret_cast(x); + auto** c_ = reinterpret_cast(c); ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right[i]), (int)m[i], (int)n[i], a_ + offset, (int)lda[i], x_ + offset, (int)incx[i], c_ + offset, (int)ldc[i], (int)group_size[i]); @@ -833,10 +833,10 @@ inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side *left_right, i } #define DGMM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, \ - const TYPE **a, int64_t *lda, const TYPE **x, int64_t *incx, TYPE **c, \ - int64_t *ldc, int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event dgmm_batch(sycl::queue& queue, side* left_right, 
int64_t* m, int64_t* n, \ + const TYPE** a, int64_t* lda, const TYPE** x, int64_t* incx, TYPE** c, \ + int64_t* ldc, int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, x, incx, c, ldc, \ group_count, group_size, dependencies); \ } @@ -849,13 +849,13 @@ DGMM_BATCH_LAUNCHER_USM(std::complex, rocblas_zdgmm_batched) #undef DGMM_BATCH_LAUNCHER template -inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose transa, +inline sycl::event gemm_batch_strided_usm_impl(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - Ts alpha, const Ta *a, int64_t lda, int64_t stridea, - const Tb *b, int64_t ldb, int64_t strideb, Ts beta, - Tc *c, int64_t ldc, int64_t stridec, + Ts alpha, const Ta* a, int64_t lda, int64_t stridea, + const Tb* b, int64_t ldb, int64_t strideb, Ts beta, + Tc* c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocTypeA = typename RocEquivalentType::Type; using rocTypeB = typename RocEquivalentType::Type; using rocTypeC = typename RocEquivalentType::Type; @@ -864,14 +864,14 @@ inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose tra int32_t solution_index = 0; rocblas_gemm_flags flags = rocblas_gemm_flags_none; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; 
ROCBLAS_ERROR_FUNC_SYNC(rocblas_gemm_strided_batched_ex, err, handle, get_rocblas_operation(transa), get_rocblas_operation(transb), m, @@ -888,11 +888,11 @@ inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose tra } #define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, \ + int64_t stridea, const TYPE_B* b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, TYPE_C* c, int64_t ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ return gemm_batch_strided_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, \ b, ldb, strideb, beta, c, ldc, stridec, batch_size, \ dependencies); \ @@ -910,11 +910,11 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM #define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, \ + int64_t stridea, const TYPE_B* b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, TYPE_C* c, int64_t 
ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ dtype_string() + "," + dtype_string() + "," + \ @@ -927,11 +927,11 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, transpose *transb, - int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a, - int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, transpose* transb, + int64_t* m, int64_t* n, int64_t* k, Ts* alpha, const Ta** a, + int64_t* lda, const Tb** b, int64_t* ldb, Ts* beta, Tc** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { using rocTypeA = typename RocEquivalentType::Type; using rocTypeB = typename RocEquivalentType::Type; using rocTypeC = typename RocEquivalentType::Type; @@ -942,17 +942,17 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr int32_t solution_index = 0; rocblas_gemm_flags flags = rocblas_gemm_flags_none; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a); - auto **b_ = reinterpret_cast(b); - auto **c_ = reinterpret_cast(c); + auto** a_ = reinterpret_cast(a); + auto** b_ = reinterpret_cast(b); + auto** c_ = reinterpret_cast(c); 
ROCBLAS_ERROR_FUNC_SYNC( rocblas_gemm_batched_ex, err, handle, get_rocblas_operation(transa[i]), get_rocblas_operation(transb[i]), (int)m[i], (int)n[i], (int)k[i], &alpha[i], @@ -971,11 +971,11 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr } #define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, \ + int64_t* n, int64_t* k, TYPE_S* alpha, const TYPE_A** a, int64_t* lda, \ + const TYPE_B** b, int64_t* ldb, TYPE_S* beta, TYPE_C** c, int64_t* ldc, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc, group_count, group_size, dependencies); \ } @@ -992,11 +992,11 @@ GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) #undef GEMM_BATCH_LAUNCHER_USM #define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, \ + int64_t* n, int64_t* k, TYPE_S* alpha, const TYPE_A** a, int64_t* lda, \ + const TYPE_B** b, int64_t* ldb, TYPE_S* beta, TYPE_C** c, int64_t* ldc, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { 
\ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ dtype_string() + "," + dtype_string() + "," + \ @@ -1009,26 +1009,26 @@ GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_BATCH_LAUNCHER_USM template -inline sycl::event trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower, +inline sycl::event trsm_batch(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - const T *a, int64_t lda, int64_t stridea, T *b, int64_t ldb, + const T* a, int64_t lda, int64_t stridea, T* b, int64_t ldb, int64_t strideb, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, stridea, strideb, batch_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), - m, n, (rocDataType *)&alpha, a_, lda, stridea, b_, ldb, strideb, + m, n, (rocDataType*)&alpha, a_, lda, stridea, b_, ldb, strideb, batch_size); }); }); @@ -1037,10 +1037,10 @@ inline sycl::event trsm_batch(Func func, sycl::queue &queue, side left_right, up } #define TRSM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag 
unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, int64_t strideb, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, int64_t stridea, TYPE* b, int64_t ldb, int64_t strideb, \ + int64_t batch_size, const std::vector& dependencies) { \ return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, stridea, b, ldb, strideb, batch_size, dependencies); \ } @@ -1053,30 +1053,30 @@ TRSM_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_ztrsm_strided_batc #undef TRSM_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event trsm_batch(Func func, sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, T *alpha, - const T **a, int64_t *lda, T **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +inline sycl::event trsm_batch(Func func, sycl::queue& queue, side* left_right, uplo* upper_lower, + transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, T* alpha, + const T** a, int64_t* lda, T** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = 
reinterpret_cast(a); - auto **b_ = reinterpret_cast(b); + auto** a_ = reinterpret_cast(a); + auto** b_ = reinterpret_cast(b); ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right[i]), get_rocblas_fill_mode(upper_lower[i]), get_rocblas_operation(trans[i]), get_rocblas_diag_type(unit_diag[i]), (int)m[i], (int)n[i], - (rocDataType *)&alpha[i], a_ + offset, (int)lda[i], + (rocDataType*)&alpha[i], a_ + offset, (int)lda[i], b_ + offset, (int)ldb[i], (int)group_size[i]); offset += group_size[i]; } @@ -1087,11 +1087,11 @@ inline sycl::event trsm_batch(Func func, sycl::queue &queue, side *left_right, u } #define TRSM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, \ - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \ - const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, \ + transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, TYPE* alpha, \ + const TYPE** a, int64_t* lda, TYPE** b, int64_t* ldb, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, b, ldb, group_count, group_size, dependencies); \ } @@ -1104,29 +1104,29 @@ TRSM_BATCH_LAUNCHER_USM(std::complex, rocblas_ztrsm_batched) #undef TRSM_BATCH_LAUNCHER_USM template -inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo *upper_lower, transpose *trans, - int64_t *n, int64_t *k, T *alpha, const T **a, int64_t *lda, T *beta, - T **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event syrk_batch(Func func, sycl::queue& queue, uplo* upper_lower, transpose* trans, + int64_t* n, int64_t* k, T* alpha, const T** a, 
int64_t* lda, T* beta, + T** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(n[i], k[i], lda[i], ldc[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a); - auto **c_ = reinterpret_cast(c); + auto** a_ = reinterpret_cast(a); + auto** c_ = reinterpret_cast(c); ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower[i]), get_rocblas_operation(trans[i]), (int)n[i], (int)k[i], - (rocDataType *)&alpha[i], a_ + offset, (int)lda[i], - (rocDataType *)&beta[i], c_ + offset, (int)ldc[i], + (rocDataType*)&alpha[i], a_ + offset, (int)lda[i], + (rocDataType*)&beta[i], c_ + offset, (int)ldc[i], (int)group_size[i]); offset += group_size[i]; } @@ -1137,10 +1137,10 @@ inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo *upper_lower, } #define SYRK_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, \ - int64_t *k, TYPE *alpha, const TYPE **a, int64_t *lda, TYPE *beta, \ - TYPE **c, int64_t *ldc, int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, \ + int64_t* k, TYPE* alpha, const TYPE** a, int64_t* lda, TYPE* beta, \ + TYPE** c, int64_t* ldc, int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return syrk_batch(ROCBLAS_ROUTINE, queue, 
upper_lower, trans, n, k, alpha, a, lda, beta, \ c, ldc, group_count, group_size, dependencies); \ } @@ -1153,24 +1153,24 @@ SYRK_BATCH_LAUNCHER_USM(std::complex, rocblas_zsyrk_batched) #undef SYRK_BATCH_LAUNCHER_USM template -inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - int64_t n, int64_t k, const T alpha, const T *a, int64_t lda, - int64_t stridea, const T beta, T *c, int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event syrk_batch(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, const T alpha, const T* a, int64_t lda, + int64_t stridea, const T beta, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldc, stridea, stridec, batch_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, stridea, (rocDataType *)&beta, c_, ldc, stridec, + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, stridea, (rocDataType*)&beta, c_, ldc, stridec, batch_size); }); }); @@ -1179,11 +1179,11 @@ inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, t } #define SYRK_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, 
\ - int64_t k, const TYPE alpha, const TYPE *a, int64_t lda, \ - int64_t stridea, const TYPE beta, TYPE *c, int64_t ldc, \ + sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, \ + int64_t k, const TYPE alpha, const TYPE* a, int64_t lda, \ + int64_t stridea, const TYPE beta, TYPE* c, int64_t ldc, \ int64_t stridec, int64_t batch_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \ stridea, beta, c, ldc, stridec, batch_size, dependencies); \ } @@ -1196,11 +1196,11 @@ SYRK_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zsyrk_strided_batc #undef SYRK_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, const T alpha, const T *a, int64_t lda, - int64_t stridea, T *b, int64_t ldb, int64_t strideb, +inline sycl::event omatcopy_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, const T alpha, const T* a, int64_t lda, + int64_t stridea, T* b, int64_t ldb, int64_t strideb, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, stridea, strideb, batch_size); @@ -1208,17 +1208,17 @@ inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose trans const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n; const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? 
n : m; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), get_rocblas_operation(trans), new_m, new_n, - (rocDataType *)&alpha, a_, lda, stridea, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, stridea, (rocDataType*)&beta, nullptr, lda, stridea, b_, ldb, strideb, batch_size); }); }); @@ -1227,10 +1227,10 @@ inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose trans } #define OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ - const TYPE alpha, const TYPE *a, int64_t lda, int64_t stridea, \ - TYPE *b, int64_t ldb, int64_t strideb, int64_t batch_size, \ - const std::vector &dependencies) { \ + sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, \ + const TYPE alpha, const TYPE* a, int64_t lda, int64_t stridea, \ + TYPE* b, int64_t ldb, int64_t strideb, int64_t batch_size, \ + const std::vector& dependencies) { \ return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, \ strideb, batch_size, dependencies); \ } @@ -1242,53 +1242,53 @@ OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zgeam_strided_ #undef OMATCOPY_STRIDED_BATCH_LAUNCHER_USM -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event 
imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } template -inline sycl::event omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, const T alpha, const T *a, int64_t lda, - int64_t stridea, const T beta, const T *b, int64_t ldb, - int64_t strideb, T *c, 
int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event omatadd_batch(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, const T alpha, const T* a, int64_t lda, + int64_t stridea, const T beta, const T* b, int64_t ldb, + int64_t strideb, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc, stridea, strideb, stridec, batch_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_, - lda, stridea, (rocDataType *)&beta, b_, ldb, strideb, c_, ldc, + get_rocblas_operation(transb), m, n, (rocDataType*)&alpha, a_, + lda, stridea, (rocDataType*)&beta, b_, ldb, strideb, c_, ldc, stridec, batch_size); }); }); @@ -1297,11 +1297,11 @@ inline sycl::event omatadd_batch(Func func, sycl::queue &queue, transpose transa } #define OMATADD_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, const TYPE alpha, const TYPE *a, int64_t lda, \ - int64_t stridea, const TYPE beta, const TYPE *b, int64_t ldb, \ - int64_t strideb, TYPE *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event 
omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, const TYPE alpha, const TYPE* a, int64_t lda, \ + int64_t stridea, const TYPE beta, const TYPE* b, int64_t ldb, \ + int64_t strideb, TYPE* c, int64_t ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ return omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, \ beta, b, ldb, strideb, c, ldc, stridec, batch_size, dependencies); \ } @@ -1314,25 +1314,25 @@ OMATADD_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zgeam_strided_b #undef OMATADD_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m, - int64_t *n, T *alpha, const T **a, int64_t *lda, T **b, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event omatcopy_batch(Func func, sycl::queue& queue, transpose* trans, int64_t* m, + int64_t* n, T* alpha, const T** a, int64_t* lda, T** b, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; for (int64_t i = 0; i < group_count; i++) { overflow_check(m[i], n[i], lda[i], ldb[i], group_size[i]); } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a); - auto **b_ = reinterpret_cast(b); + auto** a_ = reinterpret_cast(a); + auto** b_ = reinterpret_cast(b); const T beta = 0; const auto new_m = trans[i] == oneapi::mkl::transpose::nontrans ? 
m[i] : n[i]; @@ -1340,8 +1340,8 @@ inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose *tran ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans[i]), get_rocblas_operation(trans[i]), (int)new_m, (int)new_n, - (rocDataType *)&alpha[i], a_ + offset, (int)lda[i], - (rocDataType *)&beta, nullptr, (int)lda[i], b_ + offset, + (rocDataType*)&alpha[i], a_ + offset, (int)lda[i], + (rocDataType*)&beta, nullptr, (int)lda[i], b_ + offset, (int)ldb[i], (int)group_size[i]); offset += group_size[i]; } @@ -1352,10 +1352,10 @@ inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose *tran } #define OMATCOPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, \ - TYPE *alpha, const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, \ + TYPE* alpha, const TYPE** a, int64_t* lda, TYPE** b, int64_t* ldb, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb, \ group_count, group_size, dependencies); \ } @@ -1367,31 +1367,31 @@ OMATCOPY_BATCH_LAUNCHER_USM(std::complex, rocblas_zgeam_batched) #undef OMATCOPY_BATCH_LAUNCHER_USM -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, float **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, float** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event 
imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, double **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, double** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for column_major layout"); } @@ -1402,15 +1402,15 @@ namespace row_major { // Buffer APIs template -inline void copy_batch(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +inline void copy_batch(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t 
incy, int64_t stridey, int64_t batch_size) { column_major::copy_batch(func, queue, n, x, incx, stridex, y, incy, stridey, batch_size); } #define COPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void copy_batch(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - int64_t stridex, sycl::buffer &y, int64_t incy, int64_t stridey, \ + void copy_batch(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, \ int64_t batch_size) { \ copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey, batch_size); \ } @@ -1423,15 +1423,15 @@ COPY_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zcopy_strided_batched) #undef COPY_STRIDED_BATCH_LAUNCHER template -inline void axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, +inline void axpy_batch(Func func, sycl::queue& queue, int64_t n, T alpha, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { column_major::axpy_batch(func, queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } #define AXPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer &x, \ - int64_t incx, int64_t stridex, sycl::buffer &y, int64_t incy, \ + void axpy_batch(sycl::queue& queue, int64_t n, TYPE alpha, sycl::buffer& x, \ + int64_t incx, int64_t stridex, sycl::buffer& y, int64_t incy, \ int64_t stridey, int64_t batch_size) { \ axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, incy, stridey, \ batch_size); \ @@ -1445,10 +1445,10 @@ AXPY_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zaxpy_strided_batched) #undef AXPY_STRIDED_BATCH_LAUNCHER template -inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - int64_t 
stridea, sycl::buffer, 1> &x, int64_t incx, - int64_t stridex, std::complex beta, sycl::buffer, 1> &y, +inline void gemv_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + int64_t stridea, sycl::buffer, 1>& x, int64_t incx, + int64_t stridex, std::complex beta, sycl::buffer, 1>& y, int64_t incy, int64_t stridey, int64_t batch_size) { auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -1459,11 +1459,11 @@ inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m if (m > 0) { queue.submit( - [&](sycl::handler &cgh) { conj_vector(cgh, x, m, incx, stridex, batch_size); }); + [&](sycl::handler& cgh) { conj_vector(cgh, x, m, incx, stridex, batch_size); }); if (n > 0) { queue.submit( - [&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); }); + [&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); }); } } } @@ -1474,16 +1474,16 @@ inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { queue.submit( - [&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); }); + [&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); }); } } } template -inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - T alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &x, int64_t incx, int64_t stridex, T beta, - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { +inline void gemv_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + T alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& x, int64_t incx, int64_t stridex, T beta, + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { auto new_trans = trans == 
oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -1492,10 +1492,10 @@ inline void gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m } #define GEMV_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &x, int64_t incx, int64_t stridex, TYPE beta, \ - sycl::buffer &y, int64_t incy, int64_t stridey, int64_t batch_size) { \ + void gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& x, int64_t incx, int64_t stridex, TYPE beta, \ + sycl::buffer& y, int64_t incy, int64_t stridey, int64_t batch_size) { \ gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, \ beta, y, incy, stridey, batch_size); \ } @@ -1508,9 +1508,9 @@ GEMV_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zgemv_strided_batched) #undef GEMV_STRIDED_BATCH_LAUNCHER template -inline void dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m, int64_t n, - sycl::buffer &a, int64_t lda, int64_t stridea, sycl::buffer &x, - int64_t incx, int64_t stridex, sycl::buffer &c, int64_t ldc, +inline void dgmm_batch(Func func, sycl::queue& queue, side left_right, int64_t m, int64_t n, + sycl::buffer& a, int64_t lda, int64_t stridea, sycl::buffer& x, + int64_t incx, int64_t stridex, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { auto new_side = left_right == oneapi::mkl::side::left ? 
oneapi::mkl::side::right : oneapi::mkl::side::left; @@ -1520,10 +1520,10 @@ inline void dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m } #define DGMM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, \ - sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &x, int64_t incx, int64_t stridex, \ - sycl::buffer &c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ + void dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& x, int64_t incx, int64_t stridex, \ + sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, \ ldc, stridec, batch_size); \ } @@ -1536,10 +1536,10 @@ DGMM_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zdgmm_strided_batched) #undef DGMM_STRIDED_BATCH_LAUNCHER template -inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, Ts alpha, sycl::buffer &a, int64_t lda, - int64_t stridea, sycl::buffer &b, int64_t ldb, int64_t strideb, - Ts beta, sycl::buffer &c, int64_t ldc, int64_t stridec, +inline void gemm_batch_impl(sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, Ts alpha, sycl::buffer& a, int64_t lda, + int64_t stridea, sycl::buffer& b, int64_t ldb, int64_t strideb, + Ts beta, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { auto new_transa = transb; auto new_transb = transa; @@ -1550,10 +1550,10 @@ inline void gemm_batch_impl(sycl::queue &queue, transpose transa, transpose tran #undef GEMM_STRIDED_BATCH_LAUNCHER #define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t 
lda, \ - int64_t stridea, sycl::buffer &b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stridec, \ + void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + int64_t stridea, sycl::buffer& b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, sycl::buffer& c, int64_t ldc, int64_t stridec, \ int64_t batch_size) { \ gemm_batch_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, \ beta, c, ldc, stridec, batch_size); \ @@ -1571,10 +1571,10 @@ GEMM_STRIDED_BATCH_LAUNCHER(sycl::half, sycl::half, float, float) #undef GEMM_STRIDED_BATCH_LAUNCHER #define GEMM_STRIDED_BATCH_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - void gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - int64_t stridea, sycl::buffer &b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, sycl::buffer &c, int64_t ldc, int64_t stridec, \ + void gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + int64_t stridea, sycl::buffer& b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, sycl::buffer& c, int64_t ldc, int64_t stridec, \ int64_t batch_size) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ @@ -1588,9 +1588,9 @@ GEMM_STRIDED_BATCH_LAUNCHER(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER template -inline void trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower, +inline void trsm_batch(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, int64_t stridea, sycl::buffer &b, + sycl::buffer& a, int64_t lda, int64_t stridea, sycl::buffer& 
b, int64_t ldb, int64_t strideb, int64_t batch_size) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; @@ -1602,9 +1602,9 @@ inline void trsm_batch(Func func, sycl::queue &queue, side left_right, uplo uppe } #define TRSM_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, int64_t stridea, sycl::buffer &b, int64_t ldb, \ + void trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, int64_t stridea, sycl::buffer& b, int64_t ldb, \ int64_t strideb, int64_t batch_size) { \ trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, \ a, lda, stridea, b, ldb, strideb, batch_size); \ @@ -1618,9 +1618,9 @@ TRSM_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_ztrsm_strided_batched) #undef TRSM_STRIDED_BATCH_LAUNCHER template -inline void syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - T beta, sycl::buffer &c, int64_t ldc, int64_t stridec, +inline void syrk_batch(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + T beta, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -1632,9 +1632,9 @@ inline void syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpos } #define SYRK_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, int64_t stridea, TYPE beta, \ - sycl::buffer &c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ + void syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, int64_t stridea, TYPE beta, \ + sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { \ syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, stridea, beta, \ c, ldc, stridec, batch_size); \ } @@ -1647,18 +1647,18 @@ SYRK_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zsyrk_strided_batched) #undef SYRK_STRIDED_BATCH_LAUNCHER template -inline void omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - const T alpha, sycl::buffer &a, int64_t lda, int64_t stridea, - sycl::buffer &b, int64_t ldb, int64_t strideb, +inline void omatcopy_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + const T alpha, sycl::buffer& a, int64_t lda, int64_t stridea, + sycl::buffer& b, int64_t ldb, int64_t strideb, int64_t batch_size) { return column_major::omatcopy_batch(func, queue, trans, n, m, alpha, a, lda, stridea, b, ldb, strideb, batch_size); } #define OMATCOPY_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ - const TYPE alpha, sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &b, int64_t ldb, int64_t strideb, \ + void omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, \ + const TYPE alpha, sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& b, int64_t ldb, int64_t 
strideb, \ int64_t batch_size) { \ omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, \ strideb, batch_size); \ @@ -1671,45 +1671,45 @@ OMATCOPY_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zgeam_strided_batc #undef OMATCOPY_STRIDED_BATCH_LAUNCHER -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb, int64_t stride, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -void imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } template -inline void omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, const T 
alpha, sycl::buffer &a, int64_t lda, - int64_t stridea, const T beta, sycl::buffer &b, int64_t ldb, - int64_t strideb, sycl::buffer &c, int64_t ldc, int64_t stridec, +inline void omatadd_batch(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, const T alpha, sycl::buffer& a, int64_t lda, + int64_t stridea, const T beta, sycl::buffer& b, int64_t ldb, + int64_t strideb, sycl::buffer& c, int64_t ldc, int64_t stridec, int64_t batch_size) { return column_major::omatadd_batch(func, queue, transa, transb, n, m, alpha, a, lda, stridea, beta, b, ldb, strideb, c, ldc, stridec, batch_size); } #define OMATADD_STRIDED_BATCH_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, const TYPE alpha, sycl::buffer &a, int64_t lda, \ - int64_t stridea, const TYPE beta, sycl::buffer &b, int64_t ldb, \ - int64_t strideb, sycl::buffer &c, int64_t ldc, int64_t stridec, \ + void omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, const TYPE alpha, sycl::buffer& a, int64_t lda, \ + int64_t stridea, const TYPE beta, sycl::buffer& b, int64_t ldb, \ + int64_t strideb, sycl::buffer& c, int64_t ldc, int64_t stridec, \ int64_t batch_size) { \ omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, beta, \ b, ldb, strideb, c, ldc, stridec, batch_size); \ @@ -1725,17 +1725,17 @@ OMATADD_STRIDED_BATCH_LAUNCHER(std::complex, rocblas_zgeam_strided_batch // USM APIs template -inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t *n, const T **x, int64_t *incx, - T **y, int64_t *incy, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event copy_batch(Func func, sycl::queue& queue, int64_t* n, const T** x, int64_t* incx, + T** y, int64_t* incy, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return 
column_major::copy_batch(func, queue, n, x, incx, y, incy, group_count, group_size, dependencies); } #define COPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event copy_batch(sycl::queue &queue, int64_t *n, const TYPE **x, int64_t *incx, \ - TYPE **y, int64_t *incy, int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event copy_batch(sycl::queue& queue, int64_t* n, const TYPE** x, int64_t* incx, \ + TYPE** y, int64_t* incy, int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, group_count, group_size, \ dependencies); \ } @@ -1748,17 +1748,17 @@ COPY_BATCH_LAUNCHER_USM(std::complex, rocblas_zcopy_batched) #undef COPY_BATCH_LAUNCHER_USM template -inline sycl::event copy_batch(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx, - int64_t stridex, T *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event copy_batch(Func func, sycl::queue& queue, int64_t n, const T* x, int64_t incx, + int64_t stridex, T* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { return column_major::copy_batch(func, queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } #define COPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event copy_batch(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, \ - int64_t stridex, TYPE *y, int64_t incy, int64_t stridey, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event copy_batch(sycl::queue& queue, int64_t n, const TYPE* x, int64_t incx, \ + int64_t stridex, TYPE* y, int64_t incy, int64_t stridey, \ + int64_t batch_size, const std::vector& dependencies) { \ return copy_batch(ROCBLAS_ROUTINE, queue, n, x, incx, stridex, y, incy, stridey, \ batch_size, dependencies); \ } @@ -1771,17 +1771,17 @@ COPY_STRIDED_BATCH_LAUNCHER_USM(std::complex, 
rocblas_zcopy_strided_batc #undef COPY_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t *n, T *alpha, const T **x, - int64_t *incx, T **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +inline sycl::event axpy_batch(Func func, sycl::queue& queue, int64_t* n, T* alpha, const T** x, + int64_t* incx, T** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { return column_major::axpy_batch(func, queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } #define AXPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event axpy_batch(sycl::queue &queue, int64_t *n, TYPE *alpha, const TYPE **x, \ - int64_t *incx, TYPE **y, int64_t *incy, int64_t group_count, \ - int64_t *group_size, const std::vector &dependencies) { \ + sycl::event axpy_batch(sycl::queue& queue, int64_t* n, TYPE* alpha, const TYPE** x, \ + int64_t* incx, TYPE** y, int64_t* incy, int64_t group_count, \ + int64_t* group_size, const std::vector& dependencies) { \ return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, group_count, \ group_size, dependencies); \ } @@ -1794,17 +1794,17 @@ AXPY_BATCH_LAUNCHER_USM(std::complex, rocblas_zaxpy_batched) #undef AXPY_BATCH_LAUNCHER_USM template -inline sycl::event axpy_batch(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x, - int64_t incx, int64_t stridex, T *y, int64_t incy, int64_t stridey, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event axpy_batch(Func func, sycl::queue& queue, int64_t n, T alpha, const T* x, + int64_t incx, int64_t stridex, T* y, int64_t incy, int64_t stridey, + int64_t batch_size, const std::vector& dependencies) { return column_major::axpy_batch(func, queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } #define AXPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event 
axpy_batch(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \ - int64_t stridex, TYPE *y, int64_t incy, int64_t stridey, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event axpy_batch(sycl::queue& queue, int64_t n, TYPE alpha, const TYPE* x, int64_t incx, \ + int64_t stridex, TYPE* y, int64_t incy, int64_t stridey, \ + int64_t batch_size, const std::vector& dependencies) { \ return axpy_batch(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, stridex, y, incy, stridey, \ batch_size, dependencies); \ } @@ -1817,12 +1817,12 @@ AXPY_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zaxpy_strided_batc #undef AXPY_BATCH_LAUNCHER_USM template -inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - int64_t stridea, const std::complex *x, int64_t incx, - int64_t stridex, std::complex beta, std::complex *y, +inline sycl::event gemv_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + int64_t stridea, const std::complex* x, int64_t incx, + int64_t stridex, std::complex beta, std::complex* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { sycl::event done; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -1833,13 +1833,13 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, in beta = std::conj(beta); if (m > 0) { - done = queue.submit([&](sycl::handler &cgh) { - conj_vector(cgh, (std::complex *)x, m, incx, stridex, batch_size); + done = queue.submit([&](sycl::handler& cgh) { + conj_vector(cgh, (std::complex*)x, m, incx, stridex, batch_size); }); if (n > 0) { done = queue.submit( - [&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); }); + [&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy, stridey, batch_size); }); } } } @@ -1851,7 +1851,7 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, in if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, y, n, incy, stridey, batch_size); }); @@ -1862,11 +1862,11 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, in } template -inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, int64_t stridea, const T *x, - int64_t incx, int64_t stridex, T beta, T *y, int64_t incy, +inline sycl::event gemv_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + T alpha, const T* a, int64_t lda, int64_t stridea, const T* x, + int64_t incx, int64_t stridex, T beta, T* y, int64_t incy, int64_t stridey, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -1875,11 +1875,11 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose trans, in } #define GEMV_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event gemv_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x, \ - int64_t incx, int64_t stridex, TYPE beta, TYPE *y, int64_t incy, \ + sycl::event gemv_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, int64_t stridea, const TYPE* x, \ + int64_t incx, int64_t stridex, TYPE beta, TYPE* y, int64_t incy, \ int64_t stridey, int64_t batch_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, x, incx, \ stridex, beta, y, incy, stridey, batch_size, dependencies); \ } @@ -1892,12 +1892,12 @@ GEMV_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zgemv_strided_batc #undef GEMV_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m, - int64_t *n, std::complex *alpha, const std::complex **a, - int64_t *lda, const std::complex **x, int64_t *incx, - std::complex *beta, std::complex **y, int64_t *incy, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event gemv_batch(Func func, sycl::queue& queue, transpose* trans, int64_t* m, + int64_t* n, std::complex* alpha, const std::complex** a, + int64_t* lda, const std::complex** x, int64_t* incx, + std::complex* beta, std::complex** y, int64_t* incy, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { sycl::event done; int64_t stride = 0; @@ -1907,12 +1907,12 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, i beta[i] = std::conj(beta[i]); if (m[i] > 0) { - done = 
queue.submit([&](sycl::handler &cgh) { - conj_vector(cgh, (std::complex **)x, m[i], incx[i], stride, group_size[i]); + done = queue.submit([&](sycl::handler& cgh) { + conj_vector(cgh, (std::complex**)x, m[i], incx[i], stride, group_size[i]); }); if (n[i] > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n[i], incy[i], stride, group_size[i]); }); } @@ -1942,7 +1942,7 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, i for (int64_t i = 0; i < group_count; i++) { if (trans[i] == oneapi::mkl::transpose::conjtrans) { if (n[i] > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n[i], incy[i], stride, group_size[i]); }); } @@ -1954,10 +1954,10 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, i } template -inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m, - int64_t *n, T *alpha, const T **a, int64_t *lda, const T **x, - int64_t *incx, T *beta, T **y, int64_t *incy, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +inline sycl::event gemv_batch(Func func, sycl::queue& queue, transpose* trans, int64_t* m, + int64_t* n, T* alpha, const T** a, int64_t* lda, const T** x, + int64_t* incx, T* beta, T** y, int64_t* incy, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { auto tmp_trans = std::vector{ static_cast(group_count) }; for (int64_t i = 0; i < group_count; i++) { @@ -1979,9 +1979,9 @@ inline sycl::event gemv_batch(Func func, sycl::queue &queue, transpose *trans, i #define GEMV_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ sycl::event gemv_batch( \ - sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, TYPE *alpha, const TYPE **a, \ - int64_t *lda, const TYPE **x, int64_t *incx, TYPE *beta, TYPE **y, int64_t *incy, \ - int64_t group_count, int64_t *group_size, 
const std::vector &dependencies) { \ + sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, TYPE* alpha, const TYPE** a, \ + int64_t* lda, const TYPE** x, int64_t* incx, TYPE* beta, TYPE** y, int64_t* incy, \ + int64_t group_count, int64_t* group_size, const std::vector& dependencies) { \ return gemv_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, \ incy, group_count, group_size, dependencies); \ } @@ -1994,10 +1994,10 @@ GEMV_BATCH_LAUNCHER_USM(std::complex, rocblas_zgemv_batched) #undef GEMV_BATCH_LAUNCHER_USM template -inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side left_right, int64_t m, int64_t n, - const T *a, int64_t lda, int64_t stridea, const T *x, int64_t incx, - int64_t stridex, T *c, int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event dgmm_batch(Func func, sycl::queue& queue, side left_right, int64_t m, int64_t n, + const T* a, int64_t lda, int64_t stridea, const T* x, int64_t incx, + int64_t stridex, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { auto new_side = left_right == oneapi::mkl::side::left ? 
oneapi::mkl::side::right : oneapi::mkl::side::left; @@ -2006,10 +2006,10 @@ inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side left_right, in } #define DGMM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event dgmm_batch(sycl::queue &queue, side left_right, int64_t m, int64_t n, \ - const TYPE *a, int64_t lda, int64_t stridea, const TYPE *x, \ - int64_t incx, int64_t stridex, TYPE *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event dgmm_batch(sycl::queue& queue, side left_right, int64_t m, int64_t n, \ + const TYPE* a, int64_t lda, int64_t stridea, const TYPE* x, \ + int64_t incx, int64_t stridex, TYPE* c, int64_t ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, stridea, x, incx, \ stridex, c, ldc, stridec, batch_size, dependencies); \ } @@ -2022,10 +2022,10 @@ DGMM_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zdgmm_strided_batc #undef DGMM_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side *left_right, int64_t *m, - int64_t *n, const T **a, int64_t *lda, const T **x, int64_t *incx, - T **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event dgmm_batch(Func func, sycl::queue& queue, side* left_right, int64_t* m, + int64_t* n, const T** a, int64_t* lda, const T** x, int64_t* incx, + T** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { for (int64_t i = 0; i < group_count; i++) { const auto new_side = left_right[i] == oneapi::mkl::side::left ? 
oneapi::mkl::side::right : oneapi::mkl::side::left; @@ -2037,10 +2037,10 @@ inline sycl::event dgmm_batch(Func func, sycl::queue &queue, side *left_right, i } #define DGMM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event dgmm_batch(sycl::queue &queue, side *left_right, int64_t *m, int64_t *n, \ - const TYPE **a, int64_t *lda, const TYPE **x, int64_t *incx, TYPE **c, \ - int64_t *ldc, int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event dgmm_batch(sycl::queue& queue, side* left_right, int64_t* m, int64_t* n, \ + const TYPE** a, int64_t* lda, const TYPE** x, int64_t* incx, TYPE** c, \ + int64_t* ldc, int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return dgmm_batch(ROCBLAS_ROUTINE, queue, left_right, m, n, a, lda, x, incx, c, ldc, \ group_count, group_size, dependencies); \ } @@ -2053,13 +2053,13 @@ DGMM_BATCH_LAUNCHER_USM(std::complex, rocblas_zdgmm_batched) #undef DGMM_BATCH_LAUNCHER template -inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose transa, +inline sycl::event gemm_batch_strided_usm_impl(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, int64_t k, - Ts alpha, const Ta *a, int64_t lda, int64_t stridea, - const Tb *b, int64_t ldb, int64_t strideb, Ts beta, - Tc *c, int64_t ldc, int64_t stridec, + Ts alpha, const Ta* a, int64_t lda, int64_t stridea, + const Tb* b, int64_t ldb, int64_t strideb, Ts beta, + Tc* c, int64_t ldc, int64_t stridec, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto new_transa = transb; auto new_transb = transa; @@ -2069,11 +2069,11 @@ inline sycl::event gemm_batch_strided_usm_impl(sycl::queue &queue, transpose tra } #define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t 
lda, \ - int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, \ + int64_t stridea, const TYPE_B* b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, TYPE_C* c, int64_t ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ return gemm_batch_strided_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, stridea, \ b, ldb, strideb, beta, c, ldc, stridec, batch_size, \ dependencies); \ @@ -2091,11 +2091,11 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM #define GEMM_STRIDED_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, \ - int64_t stridea, const TYPE_B *b, int64_t ldb, int64_t strideb, \ - TYPE_S beta, TYPE_C *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, \ + int64_t stridea, const TYPE_B* b, int64_t ldb, int64_t strideb, \ + TYPE_S beta, TYPE_C* c, int64_t ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ dtype_string() + "," + dtype_string() + "," + \ @@ -2108,11 +2108,11 @@ GEMM_STRIDED_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, 
transpose *transb, - int64_t *m, int64_t *n, int64_t *k, Ts *alpha, const Ta **a, - int64_t *lda, const Tb **b, int64_t *ldb, Ts *beta, Tc **c, - int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, transpose* transb, + int64_t* m, int64_t* n, int64_t* k, Ts* alpha, const Ta** a, + int64_t* lda, const Tb** b, int64_t* ldb, Ts* beta, Tc** c, + int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { for (int64_t i = 0; i < group_count; i++) { std::swap(transa[i], transb[i]); } @@ -2122,11 +2122,11 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue &queue, transpose *transa, tr } #define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t *ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, \ + int64_t* n, int64_t* k, TYPE_S* alpha, const TYPE_A** a, int64_t* lda, \ + const TYPE_B** b, int64_t* ldb, TYPE_S* beta, TYPE_C** c, int64_t* ldc, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return gemm_batch_usm_impl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc, group_count, group_size, dependencies); \ } @@ -2143,11 +2143,11 @@ GEMM_BATCH_LAUNCHER_USM(sycl::half, sycl::half, float, float) #undef GEMM_BATCH_LAUNCHER_USM #define GEMM_BATCH_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S) \ - sycl::event gemm_batch(sycl::queue &queue, transpose *transa, transpose *transb, int64_t *m, \ - int64_t *n, int64_t *k, TYPE_S *alpha, const TYPE_A **a, int64_t *lda, \ - const TYPE_B **b, int64_t 
*ldb, TYPE_S *beta, TYPE_C **c, int64_t *ldc, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event gemm_batch(sycl::queue& queue, transpose* transa, transpose* transb, int64_t* m, \ + int64_t* n, int64_t* k, TYPE_S* alpha, const TYPE_A** a, int64_t* lda, \ + const TYPE_B** b, int64_t* ldb, TYPE_S* beta, TYPE_C** c, int64_t* ldc, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ throw unimplemented("blas", "gemm_batch", \ std::string("for dtype unimplemented dtype combination <") + \ dtype_string() + "," + dtype_string() + "," + \ @@ -2160,11 +2160,11 @@ GEMM_BATCH_LAUNCHER_USM(std::int8_t, std::int8_t, std::int32_t, float) #undef GEMM_BATCH_LAUNCHER_USM template -inline sycl::event trsm_batch(Func func, sycl::queue &queue, side left_right, uplo upper_lower, +inline sycl::event trsm_batch(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, - const T *a, int64_t lda, int64_t stridea, T *b, int64_t ldb, + const T* a, int64_t lda, int64_t stridea, T* b, int64_t ldb, int64_t strideb, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -2175,10 +2175,10 @@ inline sycl::event trsm_batch(Func func, sycl::queue &queue, side left_right, up } #define TRSM_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, int64_t strideb, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event trsm_batch(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, int64_t stridea, TYPE* b, int64_t ldb, int64_t strideb, \ + int64_t batch_size, const std::vector& dependencies) { \ return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, stridea, b, ldb, strideb, batch_size, dependencies); \ } @@ -2191,10 +2191,10 @@ TRSM_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_ztrsm_strided_batc #undef TRSM_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event trsm_batch(Func func, sycl::queue &queue, side *left_right, uplo *upper_lower, - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, T *alpha, - const T **a, int64_t *lda, T **b, int64_t *ldb, int64_t group_count, - int64_t *group_size, const std::vector &dependencies) { +inline sycl::event trsm_batch(Func func, sycl::queue& queue, side* left_right, uplo* upper_lower, + transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, T* alpha, + const T** a, int64_t* lda, T** b, int64_t* ldb, int64_t group_count, + int64_t* group_size, const std::vector& dependencies) { for (int64_t i = 0; i < group_count; i++) { const auto new_side = left_right[i] == oneapi::mkl::side::left ? 
oneapi::mkl::side::right : oneapi::mkl::side::left; @@ -2210,11 +2210,11 @@ inline sycl::event trsm_batch(Func func, sycl::queue &queue, side *left_right, u } #define TRSM_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsm_batch(sycl::queue &queue, side *left_right, uplo *upper_lower, \ - transpose *trans, diag *unit_diag, int64_t *m, int64_t *n, TYPE *alpha, \ - const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event trsm_batch(sycl::queue& queue, side* left_right, uplo* upper_lower, \ + transpose* trans, diag* unit_diag, int64_t* m, int64_t* n, TYPE* alpha, \ + const TYPE** a, int64_t* lda, TYPE** b, int64_t* ldb, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return trsm_batch(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, b, ldb, group_count, group_size, dependencies); \ } @@ -2227,10 +2227,10 @@ TRSM_BATCH_LAUNCHER_USM(std::complex, rocblas_ztrsm_batched) #undef TRSM_BATCH_LAUNCHER_USM template -inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo *upper_lower, transpose *trans, - int64_t *n, int64_t *k, T *alpha, const T **a, int64_t *lda, T *beta, - T **c, int64_t *ldc, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event syrk_batch(Func func, sycl::queue& queue, uplo* upper_lower, transpose* trans, + int64_t* n, int64_t* k, T* alpha, const T** a, int64_t* lda, T* beta, + T** c, int64_t* ldc, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { for (int64_t i = 0; i < group_count; i++) { const auto new_uplo = upper_lower[i] == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2247,10 +2247,10 @@ inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo *upper_lower, } #define SYRK_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syrk_batch(sycl::queue &queue, uplo *upper_lower, transpose *trans, int64_t *n, \ - int64_t *k, TYPE *alpha, const TYPE **a, int64_t *lda, TYPE *beta, \ - TYPE **c, int64_t *ldc, int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event syrk_batch(sycl::queue& queue, uplo* upper_lower, transpose* trans, int64_t* n, \ + int64_t* k, TYPE* alpha, const TYPE** a, int64_t* lda, TYPE* beta, \ + TYPE** c, int64_t* ldc, int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, \ c, ldc, group_count, group_size, dependencies); \ } @@ -2263,10 +2263,10 @@ SYRK_BATCH_LAUNCHER_USM(std::complex, rocblas_zsyrk_batched) #undef SYRK_BATCH_LAUNCHER_USM template -inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - int64_t n, int64_t k, const T alpha, const T *a, int64_t lda, - int64_t stridea, const T beta, T *c, int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event syrk_batch(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, const T alpha, const T* a, int64_t lda, + int64_t stridea, const T beta, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2277,11 +2277,11 @@ inline sycl::event syrk_batch(Func func, sycl::queue &queue, uplo upper_lower, t } #define SYRK_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syrk_batch(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, \ - int64_t k, const TYPE alpha, const TYPE *a, int64_t lda, \ - int64_t stridea, const TYPE beta, TYPE *c, int64_t ldc, \ + sycl::event syrk_batch(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, \ + int64_t k, const TYPE alpha, const TYPE* a, int64_t lda, \ + int64_t stridea, const TYPE beta, TYPE* c, int64_t ldc, \ int64_t stridec, int64_t batch_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return syrk_batch(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, \ stridea, beta, c, ldc, stridec, batch_size, dependencies); \ } @@ -2294,20 +2294,20 @@ SYRK_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zsyrk_strided_batc #undef SYRK_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, const T alpha, const T *a, int64_t lda, - int64_t stridea, T *b, int64_t ldb, int64_t strideb, +inline sycl::event omatcopy_batch(Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, const T alpha, const T* a, int64_t lda, + int64_t stridea, T* b, int64_t ldb, int64_t strideb, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return column_major::omatcopy_batch(func, queue, trans, n, m, alpha, a, lda, stridea, b, ldb, strideb, batch_size, dependencies); } #define OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ - const TYPE alpha, const TYPE *a, int64_t lda, int64_t stridea, \ - TYPE *b, int64_t ldb, int64_t strideb, int64_t batch_size, \ - const std::vector &dependencies) { \ + 
sycl::event omatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, \ + const TYPE alpha, const TYPE* a, int64_t lda, int64_t stridea, \ + TYPE* b, int64_t ldb, int64_t strideb, int64_t batch_size, \ + const std::vector& dependencies) { \ return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, stridea, b, ldb, \ strideb, batch_size, dependencies); \ } @@ -2319,49 +2319,49 @@ OMATCOPY_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zgeam_strided_ #undef OMATCOPY_STRIDED_BATCH_LAUNCHER_USM -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major 
layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, +sycl::event imatcopy_batch(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, int64_t stride, int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } template -inline sycl::event omatadd_batch(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, const T alpha, const T *a, int64_t lda, - int64_t stridea, const T beta, const T *b, int64_t ldb, - int64_t strideb, T *c, int64_t ldc, int64_t stridec, - int64_t batch_size, const std::vector &dependencies) { +inline sycl::event omatadd_batch(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, const T alpha, const T* a, int64_t lda, + int64_t stridea, const T beta, const T* b, int64_t ldb, + int64_t strideb, T* c, int64_t ldc, int64_t stridec, + int64_t batch_size, const std::vector& dependencies) { return column_major::omatadd_batch(func, queue, transa, transb, n, m, alpha, a, lda, stridea, beta, b, ldb, strideb, c, ldc, stridec, batch_size, dependencies); } #define OMATADD_STRIDED_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatadd_batch(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, const TYPE alpha, const TYPE *a, int64_t lda, \ - int64_t stridea, const TYPE beta, const TYPE *b, int64_t ldb, \ - int64_t strideb, TYPE *c, int64_t ldc, int64_t stridec, \ - int64_t batch_size, const std::vector &dependencies) { \ + sycl::event omatadd_batch(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, const TYPE alpha, const TYPE* a, int64_t lda, \ + int64_t stridea, const TYPE beta, const TYPE* b, int64_t ldb, \ + int64_t strideb, TYPE* c, 
int64_t ldc, int64_t stridec, \ + int64_t batch_size, const std::vector& dependencies) { \ return omatadd_batch(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, stridea, \ beta, b, ldb, strideb, c, ldc, stridec, batch_size, dependencies); \ } @@ -2374,19 +2374,19 @@ OMATADD_STRIDED_BATCH_LAUNCHER_USM(std::complex, rocblas_zgeam_strided_b #undef OMATADD_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event omatcopy_batch(Func func, sycl::queue &queue, transpose *trans, int64_t *m, - int64_t *n, T *alpha, const T **a, int64_t *lda, T **b, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +inline sycl::event omatcopy_batch(Func func, sycl::queue& queue, transpose* trans, int64_t* m, + int64_t* n, T* alpha, const T** a, int64_t* lda, T** b, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { return column_major::omatcopy_batch(func, queue, trans, n, m, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } #define OMATCOPY_BATCH_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, \ - TYPE *alpha, const TYPE **a, int64_t *lda, TYPE **b, int64_t *ldb, \ - int64_t group_count, int64_t *group_size, \ - const std::vector &dependencies) { \ + sycl::event omatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, \ + TYPE* alpha, const TYPE** a, int64_t* lda, TYPE** b, int64_t* ldb, \ + int64_t group_count, int64_t* group_size, \ + const std::vector& dependencies) { \ return omatcopy_batch(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb, \ group_count, group_size, dependencies); \ } @@ -2398,31 +2398,31 @@ OMATCOPY_BATCH_LAUNCHER_USM(std::complex, rocblas_zgeam_batched) #undef OMATCOPY_BATCH_LAUNCHER_USM -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - float *alpha, float **ab, int64_t *lda, int64_t *ldb, - int64_t 
group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + float* alpha, float** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - double *alpha, double **ab, int64_t *lda, int64_t *ldb, - int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + double* alpha, double** ab, int64_t* lda, int64_t* ldb, + int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major layout"); } -sycl::event imatcopy_batch(sycl::queue &queue, transpose *trans, int64_t *m, int64_t *n, - std::complex *alpha, std::complex **ab, int64_t *lda, - int64_t *ldb, int64_t group_count, int64_t *group_size, - const std::vector &dependencies) { +sycl::event imatcopy_batch(sycl::queue& queue, transpose* trans, int64_t* m, int64_t* n, + std::complex* alpha, std::complex** ab, int64_t* lda, + int64_t* ldb, int64_t group_count, int64_t* group_size, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy_batch", "for row_major 
layout"); } diff --git a/src/blas/backends/rocblas/rocblas_extensions.cpp b/src/blas/backends/rocblas/rocblas_extensions.cpp index a1fd1df1c..5fa5b61aa 100644 --- a/src/blas/backends/rocblas/rocblas_extensions.cpp +++ b/src/blas/backends/rocblas/rocblas_extensions.cpp @@ -33,65 +33,65 @@ namespace column_major { // Buffer APIs -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + 
int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose 
transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for column_major layout"); } template -inline void omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - const T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, +inline void omatcopy(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + const T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb); @@ -100,26 +100,26 @@ inline void omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n; const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? 
n : m; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), get_rocblas_operation(trans), new_m, new_n, - (rocDataType *)&alpha, a_, lda, (rocDataType *)&beta, nullptr, + (rocDataType*)&alpha, a_, lda, (rocDataType*)&beta, nullptr, lda, b_, ldb); }); }); } #define OMATCOPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, const TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, const TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { \ omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb); \ } @@ -131,16 +131,16 @@ OMATCOPY_LAUNCHER(std::complex, rocblas_zgeam) #undef OMATCOPY_LAUNCHER template -void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } #define OMATCOPY2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t 
lda, int64_t stridea, \ - sycl::buffer &b, int64_t ldb, int64_t strideb) { \ + void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& b, int64_t ldb, int64_t strideb) { \ omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, \ b, ldb, strideb); \ } @@ -152,55 +152,55 @@ OMATCOPY2_LAUNCHER(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } template -inline void omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m, - 
int64_t n, const T alpha, sycl::buffer &a, int64_t lda, const T beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { +inline void omatadd(Func func, sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, const T alpha, sycl::buffer& a, int64_t lda, const T beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_, - lda, (rocDataType *)&beta, b_, ldb, c_, ldc); + get_rocblas_operation(transb), m, n, (rocDataType*)&alpha, a_, + lda, (rocDataType*)&beta, b_, ldb, c_, ldc); }); }); } #define OMATADD_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - const TYPE alpha, sycl::buffer &a, int64_t lda, const TYPE beta, \ - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { \ + void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + const TYPE alpha, sycl::buffer& a, int64_t lda, const TYPE beta, \ + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { \ omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, \ ldc); \ } @@ -214,72 +214,72 
@@ OMATADD_LAUNCHER(std::complex, rocblas_zgeam) // USM APIs -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + 
uint8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double 
beta, double* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for column_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for column_major layout"); } template -inline sycl::event omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - const T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +inline sycl::event omatcopy(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + const T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb); @@ -287,17 +287,17 @@ inline sycl::event 
omatcopy(Func func, sycl::queue &queue, transpose trans, int6 const int64_t new_m = trans == oneapi::mkl::transpose::nontrans ? m : n; const int64_t new_n = trans == oneapi::mkl::transpose::nontrans ? n : m; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), get_rocblas_operation(trans), new_m, new_n, - (rocDataType *)&alpha, a_, lda, (rocDataType *)&beta, nullptr, + (rocDataType*)&alpha, a_, lda, (rocDataType*)&beta, nullptr, lda, b_, ldb); }); }); @@ -306,9 +306,9 @@ inline sycl::event omatcopy(Func func, sycl::queue &queue, transpose trans, int6 } #define OMATCOPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ - const TYPE alpha, const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \ - const std::vector &dependencies) { \ + sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, \ + const TYPE alpha, const TYPE* a, int64_t lda, TYPE* b, int64_t ldb, \ + const std::vector& dependencies) { \ return omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); \ } @@ -320,16 +320,16 @@ OMATCOPY_LAUNCHER_USM(std::complex, rocblas_zgeam) #undef OMATCOPY_LAUNCHER_USM template -sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b, - int64_t ldb, int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(const char* func_name, 
Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, int64_t stridea, T* b, + int64_t ldb, int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", ""); } #define OMATCOPY2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, \ - int64_t strideb, const std::vector &dependencies) { \ + sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, int64_t stridea, TYPE* b, int64_t ldb, \ + int64_t strideb, const std::vector& dependencies) { \ return omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \ lda, b, ldb, strideb, dependencies); \ } @@ -341,50 +341,50 @@ OMATCOPY2_LAUNCHER_USM(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER_USM -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { 
+sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for column_major layout"); } template -inline sycl::event omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, const T alpha, const T *a, int64_t lda, - const T beta, const T *b, int64_t ldb, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event omatadd(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, const T alpha, const T* a, int64_t lda, + const T beta, const T* b, int64_t ldb, T* c, int64_t ldc, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, (rocDataType *)&alpha, a_, - lda, 
(rocDataType *)&beta, b_, ldb, c_, ldc); + get_rocblas_operation(transb), m, n, (rocDataType*)&alpha, a_, + lda, (rocDataType*)&beta, b_, ldb, c_, ldc); }); }); @@ -392,10 +392,10 @@ inline sycl::event omatadd(Func func, sycl::queue &queue, transpose transa, tran } #define OMATADD_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, const TYPE alpha, const TYPE *a, int64_t lda, const TYPE beta, \ - const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, const TYPE alpha, const TYPE* a, int64_t lda, const TYPE beta, \ + const TYPE* b, int64_t ldb, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, \ c, ldc, dependencies); \ } @@ -413,72 +413,72 @@ namespace row_major { // Buffer APIs -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - int8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, 
transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + int8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -void gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, int64_t m, - int64_t n, int64_t k, float alpha, sycl::buffer &a, int64_t lda, - uint8_t ao, sycl::buffer &b, int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, int64_t ldc, sycl::buffer &co) { +void gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, int64_t m, + int64_t n, int64_t k, float alpha, sycl::buffer& a, int64_t lda, + uint8_t ao, sycl::buffer& b, int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, int64_t ldc, sycl::buffer& co) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, float alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, float beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, float alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, float beta, 
sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, double alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, double beta, sycl::buffer &c, +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, double alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, double beta, sycl::buffer& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -void gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, - int64_t k, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &b, int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, int64_t ldc) { +void gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, int64_t n, + int64_t k, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& b, int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, int64_t ldc) { throw unimplemented("blas", "gemmt", "for row_major layout"); } template -inline void omatcopy(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - const T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, +inline void omatcopy(Func func, sycl::queue& queue, transpose trans, 
int64_t m, int64_t n, + const T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { column_major::omatcopy(func, queue, trans, n, m, alpha, a, lda, b, ldb); } #define OMATCOPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, const TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, const TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& b, int64_t ldb) { \ omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb); \ } @@ -490,16 +490,16 @@ OMATCOPY_LAUNCHER(std::complex, rocblas_zgeam) #undef OMATCOPY_LAUNCHER template -void omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, std::int64_t stridea, - sycl::buffer &b, int64_t ldb, std::int64_t strideb) { +void omatcopy2(const char* func_name, Func func, sycl::queue& queue, transpose trans, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, std::int64_t stridea, + sycl::buffer& b, int64_t ldb, std::int64_t strideb) { throw unimplemented("blas", "omatcopy2", ""); } #define OMATCOPY2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, int64_t stridea, \ - sycl::buffer &b, int64_t ldb, int64_t strideb) { \ + void omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, int64_t stridea, \ + sycl::buffer& b, int64_t ldb, int64_t strideb) { \ omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, lda, \ b, ldb, strideb); \ } @@ -511,37 +511,37 @@ OMATCOPY2_LAUNCHER(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - sycl::buffer &ab, int64_t 
lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - sycl::buffer &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + sycl::buffer& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -void imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, int64_t lda, int64_t ldb) { +void imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& ab, int64_t lda, int64_t ldb) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } template -inline void omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, const T alpha, sycl::buffer &a, int64_t lda, const T beta, - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { +inline void omatadd(Func func, sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, const T alpha, sycl::buffer& a, int64_t lda, const T beta, + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { column_major::omatadd(func, queue, transa, transb, n, m, alpha, a, lda, beta, b, ldb, c, ldc); } #define OMATADD_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ 
- const TYPE alpha, sycl::buffer &a, int64_t lda, const TYPE beta, \ - sycl::buffer &b, int64_t ldb, sycl::buffer &c, int64_t ldc) { \ + void omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + const TYPE alpha, sycl::buffer& a, int64_t lda, const TYPE beta, \ + sycl::buffer& b, int64_t ldb, sycl::buffer& c, int64_t ldc) { \ omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, \ ldc); \ } @@ -555,79 +555,79 @@ OMATADD_LAUNCHER(std::complex, rocblas_zgeam) // USM APIs -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const int8_t *a, int64_t lda, - int8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const int8_t* a, int64_t lda, + int8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event 
gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const int8_t *b, int64_t ldb, int8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const int8_t* b, int64_t ldb, int8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event gemm_bias(sycl::queue &queue, transpose transa, transpose transb, offset offsetc, - int64_t m, int64_t n, int64_t k, float alpha, const uint8_t *a, int64_t lda, - uint8_t ao, const uint8_t *b, int64_t ldb, uint8_t bo, float beta, int32_t *c, - int64_t ldc, const int32_t *co, - const std::vector &dependencies) { +sycl::event gemm_bias(sycl::queue& queue, transpose transa, transpose transb, offset offsetc, + int64_t m, int64_t n, int64_t k, float alpha, const uint8_t* a, int64_t lda, + uint8_t ao, const uint8_t* b, int64_t ldb, uint8_t bo, float beta, int32_t* c, + int64_t ldc, const int32_t* co, + const std::vector& dependencies) { throw unimplemented("blas", "gemm_bias", "for row_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, float alpha, const float *a, int64_t lda, const float *b, - int64_t ldb, float beta, float *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, float alpha, const float* a, int64_t lda, const float* b, + int64_t ldb, float beta, float* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major 
layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major layout"); } -sycl::event gemmt(sycl::queue &queue, uplo upper_lower, transpose transa, transpose transb, - int64_t n, int64_t k, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *b, int64_t ldb, - std::complex beta, std::complex *c, int64_t ldc, - const std::vector &dependencies) { +sycl::event gemmt(sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, + int64_t n, int64_t k, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* b, int64_t ldb, + std::complex beta, std::complex* c, int64_t ldc, + const std::vector& dependencies) { throw unimplemented("blas", "gemmt", "for row_major layout"); } template -inline sycl::event omatcopy(Func func, sycl::queue 
&queue, transpose trans, int64_t m, int64_t n, - const T alpha, const T *a, int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +inline sycl::event omatcopy(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + const T alpha, const T* a, int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { return column_major::omatcopy(func, queue, trans, n, m, alpha, a, lda, b, ldb, dependencies); } #define OMATCOPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, \ - const TYPE alpha, const TYPE *a, int64_t lda, TYPE *b, int64_t ldb, \ - const std::vector &dependencies) { \ + sycl::event omatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, \ + const TYPE alpha, const TYPE* a, int64_t lda, TYPE* b, int64_t ldb, \ + const std::vector& dependencies) { \ return omatcopy(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); \ } @@ -639,16 +639,16 @@ OMATCOPY_LAUNCHER_USM(std::complex, rocblas_zgeam) #undef OMATCOPY_LAUNCHER_USM template -sycl::event omatcopy2(const char *func_name, Func func, sycl::queue &queue, transpose trans, - int64_t m, int64_t n, T alpha, const T *a, int64_t lda, int64_t stridea, T *b, - int64_t ldb, int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(const char* func_name, Func func, sycl::queue& queue, transpose trans, + int64_t m, int64_t n, T alpha, const T* a, int64_t lda, int64_t stridea, T* b, + int64_t ldb, int64_t strideb, const std::vector& dependencies) { throw unimplemented("blas", "omatcopy2", ""); } #define OMATCOPY2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatcopy2(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, int64_t stridea, TYPE *b, int64_t ldb, \ - int64_t strideb, const std::vector &dependencies) { \ + sycl::event omatcopy2(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + 
const TYPE* a, int64_t lda, int64_t stridea, TYPE* b, int64_t ldb, \ + int64_t strideb, const std::vector& dependencies) { \ return omatcopy2(#ROCBLAS_ROUTINE, ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, stridea, \ lda, b, ldb, strideb, dependencies); \ } @@ -660,44 +660,44 @@ OMATCOPY2_LAUNCHER_USM(std::complex, "unimplemented") #undef OMATCOPY2_LAUNCHER_USM -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, float alpha, - float *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, float alpha, + float* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, double alpha, - double *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, double alpha, + double* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw unimplemented("blas", "imatcopy", "for row_major layout"); } -sycl::event imatcopy(sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, std::complex *ab, int64_t lda, int64_t ldb, - const std::vector &dependencies) { +sycl::event imatcopy(sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, std::complex* ab, int64_t lda, int64_t ldb, + const std::vector& dependencies) { throw 
unimplemented("blas", "imatcopy", "for row_major layout"); } template -inline sycl::event omatadd(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, const T alpha, const T *a, int64_t lda, - const T beta, const T *b, int64_t ldb, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event omatadd(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, const T alpha, const T* a, int64_t lda, + const T beta, const T* b, int64_t ldb, T* c, int64_t ldc, + const std::vector& dependencies) { return column_major::omatadd(func, queue, transa, transb, n, m, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } #define OMATADD_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event omatadd(sycl::queue &queue, transpose transa, transpose transb, int64_t m, \ - int64_t n, const TYPE alpha, const TYPE *a, int64_t lda, const TYPE beta, \ - const TYPE *b, int64_t ldb, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event omatadd(sycl::queue& queue, transpose transa, transpose transb, int64_t m, \ + int64_t n, const TYPE alpha, const TYPE* a, int64_t lda, const TYPE beta, \ + const TYPE* b, int64_t ldb, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return omatadd(ROCBLAS_ROUTINE, queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, \ c, ldc, dependencies); \ } diff --git a/src/blas/backends/rocblas/rocblas_handle.hpp b/src/blas/backends/rocblas/rocblas_handle.hpp index 7a8dfe91f..cde400bfb 100644 --- a/src/blas/backends/rocblas/rocblas_handle.hpp +++ b/src/blas/backends/rocblas/rocblas_handle.hpp @@ -30,10 +30,10 @@ namespace rocblas { template struct rocblas_handle_ { - using handle_container_t = std::unordered_map *>; + using handle_container_t = std::unordered_map*>; handle_container_t rocblas_handle_mapper_{}; ~rocblas_handle_() noexcept(false) { - for (auto &handle_pair : rocblas_handle_mapper_) { + for (auto& handle_pair : 
rocblas_handle_mapper_) { rocblas_status err; if (handle_pair.second != nullptr) { auto handle = handle_pair.second->exchange(nullptr); diff --git a/src/blas/backends/rocblas/rocblas_helper.hpp b/src/blas/backends/rocblas/rocblas_helper.hpp index ae6301a7a..4f993697e 100644 --- a/src/blas/backends/rocblas/rocblas_helper.hpp +++ b/src/blas/backends/rocblas/rocblas_helper.hpp @@ -77,7 +77,7 @@ void overflow_check(Index index, Next... indices) { class rocblas_error : virtual public std::runtime_error { protected: - inline const char *rocblas_error_map(rocblas_status error) { + inline const char* rocblas_error_map(rocblas_status error) { switch (error) { case rocblas_status_success: return "rocblas_status_success"; case rocblas_status_invalid_handle: return "rocblas_status_invalid_handle"; @@ -124,7 +124,7 @@ class rocblas_error : virtual public std::runtime_error { class hip_error : virtual public std::runtime_error { protected: - inline const char *hip_error_map(hipError_t result) { + inline const char* hip_error_map(hipError_t result) { return hipGetErrorName(result); } int error_number; ///< error number diff --git a/src/blas/backends/rocblas/rocblas_level1.cpp b/src/blas/backends/rocblas/rocblas_level1.cpp index 3a1eacb38..be0e8638b 100644 --- a/src/blas/backends/rocblas/rocblas_level1.cpp +++ b/src/blas/backends/rocblas/rocblas_level1.cpp @@ -34,16 +34,16 @@ namespace column_major { // Buffer APIs template -inline void asum(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void asum(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - 
onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host // when the data is on buffer, it must be set to @@ -51,8 +51,8 @@ inline void asum(Func func, sycl::queue &queue, int64_t n, sycl::buffer & // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = sc.get_mem(x_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); rocblas_status err; // ASUM does not support negative index ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -65,8 +65,8 @@ inline void asum(Func func, sycl::queue &queue, int64_t n, sycl::buffer & } #define ASUM_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ asum(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -78,26 +78,26 @@ ASUM_LAUNCHER(std::complex, double, rocblas_dzasum) #undef ASUM_LAUNCHER template -inline void scal(Func func, sycl::queue &queue, int64_t n, T1 a, sycl::buffer &x, +inline void scal(Func func, sycl::queue& queue, int64_t n, T1 a, sycl::buffer& x, int64_t incx) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; // SCAL does not support negative incx - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType1 *)&a, x_, std::abs(incx)); + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType1*)&a, x_, std::abs(incx)); }); }); } #define SCAL_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer &x, int64_t incx) { \ + void scal(sycl::queue& queue, int64_t n, TYPE1 a, sycl::buffer& x, int64_t incx) { \ scal(ROCBLAS_ROUTINE, queue, n, a, x, incx); \ } @@ -111,29 +111,28 @@ SCAL_LAUNCHER(double, std::complex, rocblas_zdscal) #undef SCAL_LAUNCHER template -inline void axpy(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy) { +inline void axpy(Func func, sycl::queue& queue, int64_t n, T alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, y_, - incy); + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType*)&alpha, x_, incx, y_, incy); }); }); } #define AXPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void axpy(sycl::queue& queue, int64_t n, TYPE alpha, 
sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy); \ } @@ -144,40 +143,40 @@ AXPY_LAUNCHER(std::complex, rocblas_zaxpy) #undef AXPY_LAUNCHER -void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } -void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for column_major layout"); } template -inline void rotg(Func func, sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s) { +inline void rotg(Func func, sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename 
RocEquivalentType::Type; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto s_acc = s.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host // when the data is on buffer, it must be set to @@ -185,10 +184,10 @@ inline void rotg(Func func, sycl::queue &queue, sycl::buffer &a, sycl::bu // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); - auto s_ = sc.get_mem(s_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); + auto s_ = sc.get_mem(s_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, a_, b_, c_, s_); // Higher level BLAS functions expect rocblas_pointer_mode_host @@ -200,8 +199,8 @@ inline void rotg(Func func, sycl::queue &queue, sycl::buffer &a, sycl::bu } #define ROTG_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, \ - sycl::buffer &c, sycl::buffer &s) { \ + void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, \ + sycl::buffer& c, sycl::buffer& s) { \ rotg(ROCBLAS_ROUTINE, queue, a, b, c, s); \ } @@ -213,16 +212,16 @@ ROTG_LAUNCHER(std::complex, double, rocblas_zrotg) #undef ROTG_LAUNCHER template -inline void rotm(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { +inline void rotm(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t 
incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto param_acc = param.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host @@ -231,9 +230,9 @@ inline void rotm(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); - auto param_ = sc.get_mem(param_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto param_ = sc.get_mem(param_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, param_); // Higher level BLAS functions expect rocblas_pointer_mode_host @@ -245,8 +244,8 @@ inline void rotm(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x } #define ROTM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void rotm(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { \ + void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy, sycl::buffer& param) { \ rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param); \ } @@ -256,19 +255,19 @@ ROTM_LAUNCHER(double, rocblas_drotm) #undef ROTM_LAUNCHER template -inline void copy(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { +inline void copy(Func func, sycl::queue& queue, 
int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -276,8 +275,8 @@ inline void copy(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x } #define COPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -289,16 +288,16 @@ COPY_LAUNCHER(std::complex, rocblas_zcopy) #undef COPY_LAUNCHER template -inline void dot(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +inline void dot(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, 
[=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host @@ -307,9 +306,9 @@ inline void dot(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect rocblas_pointer_mode_host @@ -321,8 +320,8 @@ inline void dot(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, } #define DOT_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE) \ - void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, const int64_t incy, sycl::buffer &result) { \ + void dot##EXT(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, const int64_t incy, sycl::buffer& result) { \ dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result); \ } @@ -335,23 +334,23 @@ DOT_LAUNCHER(c, std::complex, rocblas_zdotc) #undef DOT_LAUNCHER -void dot(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { throw unimplemented("blas", "dot", "for column_major layout"); } template -inline void rot(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &y, int64_t incy, T2 c, T3 s) { +inline void rot(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& y, int64_t incy, T2 c, T3 s) { using 
rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; using rocDataType3 = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host // when the data is on buffer, it must be set to @@ -359,18 +358,18 @@ inline void rot(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. // rocblas_set_pointer_mode(handle, rocblas_set_pointer_mode); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, (rocDataType2 *)&c, - (rocDataType3 *)&s); + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, (rocDataType2*)&c, + (rocDataType3*)&s); }); }); } #define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE) \ - void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, int64_t incy, TYPE2 c, TYPE3 s) { \ + void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, int64_t incy, TYPE2 c, TYPE3 s) { \ rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s); \ } @@ -381,16 +380,16 @@ ROT_LAUNCHER(std::complex, double, double, rocblas_zdrot) #undef ROT_LAUNCHER -void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void sdsdot(sycl::queue& queue, int64_t n, 
float sb, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { overflow_check(n, incx, incy); // rocBLAS does not support sdot so we need to mimic sdot. - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.get_access(cgh); auto y_acc = y.get_access(cgh); auto res_acc = result.get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host @@ -399,9 +398,9 @@ void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); + auto res_ = sc.get_mem(res_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(rocblas_sdot, err, handle, n, x_, incx, y_, incy, res_); // Higher level BLAS functions expect rocblas_pointer_mode_host @@ -417,18 +416,18 @@ void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, } template -inline void rotmg(Func func, sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, T y1, sycl::buffer ¶m) { +inline void rotmg(Func func, sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, T y1, sycl::buffer& param) { using rocDataType = typename RocEquivalentType::Type; sycl::buffer y1_buff(&y1, sycl::range<1>(1)); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto d1_acc = d1.template get_access(cgh); auto d2_acc = d2.template get_access(cgh); auto x1_acc = x1.template get_access(cgh); auto y1_acc = y1_buff.template get_access(cgh); auto 
param_acc = param.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host @@ -437,11 +436,11 @@ inline void rotmg(Func func, sycl::queue &queue, sycl::buffer &d1, sycl::b // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto d1_ = sc.get_mem(d1_acc); - auto d2_ = sc.get_mem(d2_acc); - auto x1_ = sc.get_mem(x1_acc); - auto y1_ = sc.get_mem(y1_acc); - auto param_ = sc.get_mem(param_acc); + auto d1_ = sc.get_mem(d1_acc); + auto d2_ = sc.get_mem(d2_acc); + auto x1_ = sc.get_mem(x1_acc); + auto y1_ = sc.get_mem(y1_acc); + auto param_ = sc.get_mem(param_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, d1_, d2_, x1_, y1_, param_); // Higher level BLAS functions expect rocblas_pointer_mode_host @@ -453,8 +452,8 @@ inline void rotmg(Func func, sycl::queue &queue, sycl::buffer &d1, sycl::b } #define ROTMG_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, \ - sycl::buffer &x1, TYPE y1, sycl::buffer ¶m) { \ + void rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, \ + sycl::buffer& x1, TYPE y1, sycl::buffer& param) { \ rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param); \ } @@ -464,8 +463,8 @@ ROTMG_LAUNCHER(double, rocblas_drotmg) #undef ROTMG_LAUNCHER template -inline void iamax(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void iamax(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); @@ -477,10 +476,10 @@ inline void iamax(Func func, 
sycl::queue &queue, int64_t n, sycl::buffer & // to elementwise copy the data between two buffer, or allow reinterpret cast // to convert to different type with different typesize size. sycl::buffer int_res_buff{ sycl::range<1>(1) }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host @@ -489,8 +488,8 @@ inline void iamax(Func func, sycl::queue &queue, int64_t n, sycl::buffer & // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = sc.get_mem(x_acc); - auto int_res_ = sc.get_mem(int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); rocblas_status err; // For negative incx, iamax returns 0. This behaviour is similar to that of // reference netlib BLAS. 
@@ -502,7 +501,7 @@ inline void iamax(Func func, sycl::queue &queue, int64_t n, sycl::buffer & }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto int_res_acc = int_res_buff.template get_access(cgh); auto result_acc = result.template get_access(cgh); cgh.single_task( @@ -511,8 +510,8 @@ inline void iamax(Func func, sycl::queue &queue, int64_t n, sycl::buffer & } #define IAMAX_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -524,19 +523,19 @@ IAMAX_LAUNCHER(std::complex, rocblas_izamax) #undef IAMAX_LAUNCHER template -inline void swap(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { +inline void swap(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -544,8 +543,8 @@ inline void swap(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x } #define SWAP_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + 
void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -557,8 +556,8 @@ SWAP_LAUNCHER(std::complex, rocblas_zswap) #undef SWAP_LAUNCHER template -inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void iamin(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); @@ -570,10 +569,10 @@ inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer & // to elementwise copy the data between two buffer, or allow reinterpret cast // to convert to different type with different typesize size. sycl::buffer int_res_buff{ sycl::range<1>(1) }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto int_res_acc = int_res_buff.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host @@ -582,8 +581,8 @@ inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer & // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = sc.get_mem(x_acc); - auto int_res_ = sc.get_mem(int_res_acc); + auto x_ = sc.get_mem(x_acc); + auto int_res_ = sc.get_mem(int_res_acc); rocblas_status err; // For negative incx, iamin returns 0. This behaviour is similar to that of // implemented as a reference IAMIN. 
@@ -595,7 +594,7 @@ inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer & }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto int_res_acc = int_res_buff.template get_access(cgh); auto result_acc = result.template get_access(cgh); cgh.single_task( @@ -604,8 +603,8 @@ inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer & } #define IAMIN_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -617,16 +616,16 @@ IAMIN_LAUNCHER(std::complex, rocblas_izamin) #undef IAMIN_LAUNCHER template -inline void nrm2(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void nrm2(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto x_acc = x.template get_access(cgh); auto res_acc = result.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); // By default the pointer mode is the rocblas_pointer_mode_host @@ -635,8 +634,8 @@ inline void nrm2(Func func, sycl::queue &queue, int64_t n, sycl::buffer & // fault. When it is set to device it is users responsibility to // synchronise as the function is completely asynchronous. 
rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = sc.get_mem(x_acc); - auto res_ = sc.get_mem(res_acc); + auto x_ = sc.get_mem(x_acc); + auto res_ = sc.get_mem(res_acc); rocblas_status err; // NRM2 does not support negative index ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -649,8 +648,8 @@ inline void nrm2(Func func, sycl::queue &queue, int64_t n, sycl::buffer & } #define NRM2_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -664,20 +663,20 @@ NRM2_LAUNCHER(std::complex, double, rocblas_dznrm2) // USM APIs template -inline sycl::event asum(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx, - T2 *result, const std::vector &dependencies) { +inline sycl::event asum(Func func, sycl::queue& queue, int64_t n, const T1* x, const int64_t incx, + T2* result, const std::vector& dependencies) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = reinterpret_cast(x); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto res_ = reinterpret_cast(result); rocblas_status err; // ASUM does not support negative index ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -689,8 +688,8 @@ inline sycl::event asum(Func 
func, sycl::queue &queue, int64_t n, const T1 *x, c } #define ASUM_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event asum(sycl::queue& queue, int64_t n, const TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return asum(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } @@ -702,21 +701,21 @@ ASUM_LAUNCHER_USM(std::complex, double, rocblas_dzasum) #undef ASUM_LAUNCHER_USM template -inline sycl::event scal(Func func, sycl::queue &queue, int64_t n, T1 a, T2 *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event scal(Func func, sycl::queue& queue, int64_t n, T1 a, T2* x, int64_t incx, + const std::vector& dependencies) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); + auto x_ = reinterpret_cast(x); rocblas_status err; // SCAL does not support negative incx - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType1 *)&a, x_, std::abs(incx)); + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType1*)&a, x_, std::abs(incx)); }); }); @@ -724,8 +723,8 @@ inline sycl::event scal(Func func, sycl::queue &queue, int64_t n, T1 a, T2 *x, i } #define SCAL_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event scal(sycl::queue& queue, int64_t n, TYPE1 a, TYPE2* x, int64_t incx, \ + const 
std::vector& dependencies) { \ return scal(ROCBLAS_ROUTINE, queue, n, a, x, incx, dependencies); \ } @@ -739,21 +738,20 @@ SCAL_LAUNCHER_USM(double, std::complex, rocblas_zdscal) #undef SCAL_LAUNCHER_USM template -inline sycl::event axpy(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x, int64_t incx, - T *y, int64_t incy, const std::vector &dependencies) { +inline sycl::event axpy(Func func, sycl::queue& queue, int64_t n, T alpha, const T* x, int64_t incx, + T* y, int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType *)&alpha, x_, incx, y_, - incy); + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, (rocDataType*)&alpha, x_, incx, y_, incy); }); }); @@ -761,8 +759,8 @@ inline sycl::event axpy(Func func, sycl::queue &queue, int64_t n, T alpha, const } #define AXPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \ - TYPE *y, int64_t incy, const std::vector &dependencies) { \ + sycl::event axpy(sycl::queue& queue, int64_t n, TYPE alpha, const TYPE* x, int64_t incx, \ + TYPE* y, int64_t incy, const std::vector& dependencies) { \ return axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, dependencies); \ } @@ -773,44 +771,44 @@ AXPY_LAUNCHER_USM(std::complex, rocblas_zaxpy) #undef AXPY_LAUNCHER_USM -sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const 
float *x, int64_t incx, - float beta, float *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + float beta, float* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for column_major layout"); } template -inline sycl::event rotg(Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c, T1 *s, - const std::vector &dependencies) { +inline sycl::event rotg(Func func, sycl::queue& queue, T1* a, T1* b, T2* c, T1* s, + const std::vector& dependencies) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = 
typename RocEquivalentType::Type; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); - auto s_ = reinterpret_cast(s); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); + auto s_ = reinterpret_cast(s); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, a_, b_, c_, s_); }); @@ -820,8 +818,8 @@ inline sycl::event rotg(Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c, T1 * } #define ROTG_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \ - const std::vector &dependencies) { \ + sycl::event rotg(sycl::queue& queue, TYPE1* a, TYPE1* b, TYPE2* c, TYPE1* s, \ + const std::vector& dependencies) { \ return rotg(ROCBLAS_ROUTINE, queue, a, b, c, s, dependencies); \ } @@ -833,19 +831,19 @@ ROTG_LAUNCHER_USM(std::complex, double, rocblas_zrotg) #undef ROTG_LAUNCHER_USM template -inline sycl::event rotm(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y, - int64_t incy, T *param, const std::vector &dependencies) { +inline sycl::event rotm(Func func, sycl::queue& queue, int64_t n, T* x, int64_t incx, T* y, + int64_t incy, T* param, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); - auto param_ = reinterpret_cast(param); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto param_ = reinterpret_cast(param); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, param_); }); @@ -855,8 +853,8 @@ inline sycl::event rotm(Func func, sycl::queue &queue, int64_t n, T *x, int64_t } #define ROTM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - TYPE *param, const std::vector &dependencies) { \ + sycl::event rotm(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + TYPE* param, const std::vector& dependencies) { \ return rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param, dependencies); \ } @@ -866,18 +864,18 @@ ROTM_LAUNCHER_USM(double, rocblas_drotm) #undef ROTM_LAUNCHER_USM template -inline sycl::event copy(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event copy(Func func, sycl::queue& queue, int64_t n, const T* x, int64_t incx, T* y, + int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -887,8 +885,8 @@ inline sycl::event copy(Func func, sycl::queue &queue, int64_t n, const T 
*x, in } #define COPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event copy(sycl::queue& queue, int64_t n, const TYPE* x, int64_t incx, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -900,20 +898,20 @@ COPY_LAUNCHER_USM(std::complex, rocblas_zcopy) #undef COPY_LAUNCHER_USM template -inline sycl::event dot(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx, - const T *y, int64_t incy, T *result, - const std::vector &dependencies) { +inline sycl::event dot(Func func, sycl::queue& queue, int64_t n, const T* x, const int64_t incx, + const T* y, int64_t incy, T* result, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto res_ = reinterpret_cast(result); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, res_); }); @@ -923,9 +921,9 @@ inline sycl::event dot(Func func, sycl::queue &queue, int64_t n, const T *x, con } #define DOT_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE) \ - sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - const TYPE *y, const int64_t incy, TYPE *result, \ - const std::vector &dependencies) { \ + sycl::event dot##EXT(sycl::queue& queue, int64_t n, const 
TYPE* x, const int64_t incx, \ + const TYPE* y, const int64_t incy, TYPE* result, \ + const std::vector& dependencies) { \ return dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result, dependencies); \ } @@ -938,29 +936,29 @@ DOT_LAUNCHER_USM(c, std::complex, rocblas_zdotc) #undef DOT_LAUNCHER_USM -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, double *result, const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, double* result, const std::vector& dependencies) { throw unimplemented("blas", "dot", "for column_major layout"); } template -inline sycl::event rot(Func func, sycl::queue &queue, int64_t n, T1 *x, const int64_t incx, T1 *y, - int64_t incy, T2 c, T3 s, const std::vector &dependencies) { +inline sycl::event rot(Func func, sycl::queue& queue, int64_t n, T1* x, const int64_t incx, T1* y, + int64_t incy, T2 c, T3 s, const std::vector& dependencies) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; using rocDataType3 = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, (rocDataType2 *)&c, - (rocDataType3 *)&s); + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy, (rocDataType2*)&c, + (rocDataType3*)&s); }); }); @@ -968,9 +966,9 @@ inline sycl::event rot(Func func, sycl::queue 
&queue, int64_t n, T1 *x, const in } #define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE) \ - sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \ + sycl::event rot(sycl::queue& queue, int64_t n, TYPE1* x, const int64_t incx, TYPE1* y, \ int64_t incy, TYPE2 c, TYPE3 s, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, dependencies); \ } @@ -981,20 +979,20 @@ ROT_LAUNCHER_USM(std::complex, double, double, rocblas_zdrot) #undef ROT_LAUNCHER_USM -sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx, - const float *y, int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event sdsdot(sycl::queue& queue, int64_t n, float sb, const float* x, int64_t incx, + const float* y, int64_t incy, float* result, + const std::vector& dependencies) { overflow_check(n, incx, incy); // rocBLAS does not support sdot so we need to mimic sdot. 
- auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); + auto res_ = reinterpret_cast(result); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(rocblas_sdot, err, handle, n, x_, incx, y_, incy, res_); }); @@ -1006,20 +1004,20 @@ sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int6 } template -inline sycl::event rotmg(Func func, sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param, - const std::vector &dependencies) { +inline sycl::event rotmg(Func func, sycl::queue& queue, T* d1, T* d2, T* x1, T y1, T* param, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto d1_ = reinterpret_cast(d1); - auto d2_ = reinterpret_cast(d2); - auto x1_ = reinterpret_cast(x1); - auto y1_ = reinterpret_cast(&y1); - auto param_ = reinterpret_cast(param); + auto d1_ = reinterpret_cast(d1); + auto d2_ = reinterpret_cast(d2); + auto x1_ = reinterpret_cast(x1); + auto y1_ = reinterpret_cast(&y1); + auto param_ = reinterpret_cast(param); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, d1_, d2_, x1_, y1_, param_); }); @@ -1029,8 +1027,8 @@ inline sycl::event rotmg(Func func, sycl::queue &queue, T *d1, T *d2, T *x1, T y } #define 
ROTMG_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param, \ - const std::vector &dependencies) { \ + sycl::event rotmg(sycl::queue& queue, TYPE* d1, TYPE* d2, TYPE* x1, TYPE y1, TYPE* param, \ + const std::vector& dependencies) { \ return rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies); \ } @@ -1040,8 +1038,8 @@ ROTMG_LAUNCHER_USM(double, rocblas_drotmg) #undef ROTMG_LAUNCHER_USM template -inline sycl::event iamax(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx, - int64_t *result, const std::vector &dependencies) { +inline sycl::event iamax(Func func, sycl::queue& queue, int64_t n, const T* x, const int64_t incx, + int64_t* result, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); // rocBLAS does not support int64_t as return type for the data by default. So we need to @@ -1049,17 +1047,17 @@ inline sycl::event iamax(Func func, sycl::queue &queue, int64_t n, const T *x, c // it back to the actual data on the host. // This change may cause failure as the result of integer overflow // based on the size. 
- auto int_res_p = (int *)sycl::aligned_alloc_shared(64, sizeof(rocblas_int), queue.get_device(), - queue.get_context()); + auto int_res_p = (int*)sycl::aligned_alloc_shared(64, sizeof(rocblas_int), queue.get_device(), + queue.get_context()); *int_res_p = 0; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = reinterpret_cast(x); - auto int_res_p_ = reinterpret_cast(int_res_p); + auto x_ = reinterpret_cast(x); + auto int_res_p_ = reinterpret_cast(int_res_p); rocblas_status err; // For negative incx, iamax returns 0. This behaviour is similar to that of // reference iamax. @@ -1074,8 +1072,8 @@ inline sycl::event iamax(Func func, sycl::queue &queue, int64_t n, const T *x, c } #define IAMAX_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamax(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + int64_t* result, const std::vector& dependencies) { \ return iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } @@ -1087,18 +1085,18 @@ IAMAX_LAUNCHER_USM(std::complex, rocblas_izamax) #undef IAMAX_LAUNCHER_USM template -inline sycl::event swap(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event swap(Func func, sycl::queue& queue, int64_t n, T* x, int64_t incx, T* y, + int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = 
queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, incx, y_, incy); }); @@ -1108,8 +1106,8 @@ inline sycl::event swap(Func func, sycl::queue &queue, int64_t n, T *x, int64_t } #define SWAP_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event swap(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -1121,8 +1119,8 @@ SWAP_LAUNCHER_USM(std::complex, rocblas_zswap) #undef SWAP_LAUNCHER_USM template -inline sycl::event iamin(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx, - int64_t *result, const std::vector &dependencies) { +inline sycl::event iamin(Func func, sycl::queue& queue, int64_t n, const T* x, const int64_t incx, + int64_t* result, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); // rocBLAS does not support int64_t as return type for the data by default. So we need to @@ -1130,18 +1128,18 @@ inline sycl::event iamin(Func func, sycl::queue &queue, int64_t n, const T *x, c // it back to the actual data on the host. // This change may cause failure as the result of integer overflow // based on the size. 
- auto int_res_p = (int *)sycl::aligned_alloc_shared(64, sizeof(rocblas_int), queue.get_device(), - queue.get_context()); + auto int_res_p = (int*)sycl::aligned_alloc_shared(64, sizeof(rocblas_int), queue.get_device(), + queue.get_context()); *int_res_p = 0; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = reinterpret_cast(x); - auto int_res_p_ = reinterpret_cast(int_res_p); + auto x_ = reinterpret_cast(x); + auto int_res_p_ = reinterpret_cast(int_res_p); rocblas_status err; // For negative incx, iamin returns 0. This behaviour is similar to that of // implemented iamin. @@ -1156,8 +1154,8 @@ inline sycl::event iamin(Func func, sycl::queue &queue, int64_t n, const T *x, c } #define IAMIN_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamin(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + int64_t* result, const std::vector& dependencies) { \ return iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } @@ -1169,20 +1167,20 @@ IAMIN_LAUNCHER_USM(std::complex, rocblas_izamin) #undef IAMIN_LAUNCHER_USM template -inline sycl::event nrm2(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx, - T2 *result, const std::vector &dependencies) { +inline sycl::event nrm2(Func func, sycl::queue& queue, int64_t n, const T1* x, const int64_t incx, + T2* result, const std::vector& dependencies) { using rocDataType1 = typename RocEquivalentType::Type; using rocDataType2 = typename RocEquivalentType::Type; overflow_check(n, incx); - 
auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); rocblas_set_pointer_mode(handle, rocblas_pointer_mode_device); - auto x_ = reinterpret_cast(x); - auto res_ = reinterpret_cast(result); + auto x_ = reinterpret_cast(x); + auto res_ = reinterpret_cast(result); rocblas_status err; // NRM2 does not support negative index ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, n, x_, std::abs(incx), res_); @@ -1194,8 +1192,8 @@ inline sycl::event nrm2(Func func, sycl::queue &queue, int64_t n, const T1 *x, c } #define NRM2_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event nrm2(sycl::queue& queue, int64_t n, const TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } @@ -1212,14 +1210,14 @@ namespace row_major { // Buffer APIs template -inline void asum(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void asum(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { column_major::asum(func, queue, n, x, incx, result); } #define ASUM_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void asum(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void asum(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ asum(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -1231,13 +1229,13 @@ ASUM_LAUNCHER(std::complex, double, rocblas_dzasum) #undef ASUM_LAUNCHER template 
-inline void scal(Func func, sycl::queue &queue, int64_t n, T1 a, sycl::buffer &x, +inline void scal(Func func, sycl::queue& queue, int64_t n, T1 a, sycl::buffer& x, int64_t incx) { column_major::scal(func, queue, n, a, x, incx); } #define SCAL_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void scal(sycl::queue &queue, int64_t n, TYPE1 a, sycl::buffer &x, int64_t incx) { \ + void scal(sycl::queue& queue, int64_t n, TYPE1 a, sycl::buffer& x, int64_t incx) { \ scal(ROCBLAS_ROUTINE, queue, n, a, x, incx); \ } @@ -1251,14 +1249,14 @@ SCAL_LAUNCHER(double, std::complex, rocblas_zdscal) #undef SCAL_LAUNCHER template -inline void axpy(Func func, sycl::queue &queue, int64_t n, T alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy) { +inline void axpy(Func func, sycl::queue& queue, int64_t n, T alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy) { column_major::axpy(func, queue, n, alpha, x, incx, y, incy); } #define AXPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void axpy(sycl::queue &queue, int64_t n, TYPE alpha, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void axpy(sycl::queue& queue, int64_t n, TYPE alpha, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy); \ } @@ -1269,37 +1267,37 @@ AXPY_LAUNCHER(std::complex, rocblas_zaxpy) #undef AXPY_LAUNCHER -void axpby(sycl::queue &queue, int64_t n, float alpha, sycl::buffer &x, int64_t incx, - float beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, float alpha, sycl::buffer& x, int64_t incx, + float beta, sycl::buffer& y, int64_t incy) { throw unimplemented("blas", "axpby", "for row_major layout"); } -void axpby(sycl::queue &queue, int64_t n, double alpha, sycl::buffer &x, int64_t incx, - double beta, sycl::buffer &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, double alpha, sycl::buffer& x, int64_t incx, + double beta, sycl::buffer& y, int64_t incy) { 
throw unimplemented("blas", "axpby", "for row_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for row_major layout"); } -void axpby(sycl::queue &queue, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +void axpby(sycl::queue& queue, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { throw unimplemented("blas", "axpby", "for row_major layout"); } template -inline void rotg(Func func, sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, - sycl::buffer &c, sycl::buffer &s) { +inline void rotg(Func func, sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, + sycl::buffer& c, sycl::buffer& s) { column_major::rotg(func, queue, a, b, c, s); } #define ROTG_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void rotg(sycl::queue &queue, sycl::buffer &a, sycl::buffer &b, \ - sycl::buffer &c, sycl::buffer &s) { \ + void rotg(sycl::queue& queue, sycl::buffer& a, sycl::buffer& b, \ + sycl::buffer& c, sycl::buffer& s) { \ rotg(ROCBLAS_ROUTINE, queue, a, b, c, s); \ } @@ -1311,14 +1309,14 @@ ROTG_LAUNCHER(std::complex, double, rocblas_zrotg) #undef ROTG_LAUNCHER template -inline void rotm(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { +inline void rotm(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& param) { column_major::rotm(func, queue, n, x, incx, y, incy, param); } #define ROTM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void rotm(sycl::queue &queue, int64_t n, sycl::buffer 
&x, int64_t incx, \ - sycl::buffer &y, int64_t incy, sycl::buffer ¶m) { \ + void rotm(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy, sycl::buffer& param) { \ rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param); \ } @@ -1328,14 +1326,14 @@ ROTM_LAUNCHER(double, rocblas_drotm) #undef ROTM_LAUNCHER template -inline void copy(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { +inline void copy(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { column_major::copy(func, queue, n, x, incx, y, incy); } #define COPY_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void copy(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void copy(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -1347,14 +1345,14 @@ COPY_LAUNCHER(std::complex, rocblas_zcopy) #undef COPY_LAUNCHER template -inline void dot(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +inline void dot(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { column_major::dot(func, queue, n, x, incx, y, incy, result); } #define DOT_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE) \ - void dot##EXT(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, const int64_t incy, sycl::buffer &result) { \ + void dot##EXT(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, const int64_t incy, sycl::buffer& result) { \ dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result); \ } @@ -1367,20 +1365,20 @@ DOT_LAUNCHER(c, std::complex, rocblas_zdotc) #undef DOT_LAUNCHER -void dot(sycl::queue &queue, 
int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void dot(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { throw unimplemented("blas", "dot", "for row_major layout"); } template -inline void rot(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &y, int64_t incy, T2 c, T3 s) { +inline void rot(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& y, int64_t incy, T2 c, T3 s) { column_major::rot(func, queue, n, x, incx, y, incy, c, s); } #define ROT_LAUNCHER(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE) \ - void rot(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &y, int64_t incy, TYPE2 c, TYPE3 s) { \ + void rot(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& y, int64_t incy, TYPE2 c, TYPE3 s) { \ rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s); \ } @@ -1391,20 +1389,20 @@ ROT_LAUNCHER(std::complex, double, double, rocblas_zdrot) #undef ROT_LAUNCHER -void sdsdot(sycl::queue &queue, int64_t n, float sb, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy, sycl::buffer &result) { +void sdsdot(sycl::queue& queue, int64_t n, float sb, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy, sycl::buffer& result) { column_major::sdsdot(queue, n, sb, x, incx, y, incy, result); } template -inline void rotmg(Func func, sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, - sycl::buffer &x1, T y1, sycl::buffer ¶m) { +inline void rotmg(Func func, sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, + sycl::buffer& x1, T y1, sycl::buffer& param) { column_major::rotmg(func, queue, d1, d2, x1, y1, param); } #define ROTMG_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void rotmg(sycl::queue &queue, sycl::buffer &d1, sycl::buffer &d2, \ - sycl::buffer &x1, TYPE y1, sycl::buffer ¶m) { \ + void 
rotmg(sycl::queue& queue, sycl::buffer& d1, sycl::buffer& d2, \ + sycl::buffer& x1, TYPE y1, sycl::buffer& param) { \ rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param); \ } @@ -1414,14 +1412,14 @@ ROTMG_LAUNCHER(double, rocblas_drotmg) #undef ROTMG_LAUNCHER template -inline void iamax(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void iamax(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { column_major::iamax(func, queue, n, x, incx, result); } #define IAMAX_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void iamax(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamax(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -1433,14 +1431,14 @@ IAMAX_LAUNCHER(std::complex, rocblas_izamax) #undef IAMAX_LAUNCHER template -inline void swap(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, - sycl::buffer &y, int64_t incy) { +inline void swap(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, + sycl::buffer& y, int64_t incy) { column_major::swap(func, queue, n, x, incx, y, incy); } #define SWAP_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void swap(sycl::queue &queue, int64_t n, sycl::buffer &x, int64_t incx, \ - sycl::buffer &y, int64_t incy) { \ + void swap(sycl::queue& queue, int64_t n, sycl::buffer& x, int64_t incx, \ + sycl::buffer& y, int64_t incy) { \ swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy); \ } @@ -1452,14 +1450,14 @@ SWAP_LAUNCHER(std::complex, rocblas_zswap) #undef SWAP_LAUNCHER template -inline void iamin(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void iamin(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { 
column_major::iamin(func, queue, n, x, incx, result); } #define IAMIN_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void iamin(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void iamin(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -1471,14 +1469,14 @@ IAMIN_LAUNCHER(std::complex, rocblas_izamin) #undef IAMIN_LAUNCHER template -inline void nrm2(Func func, sycl::queue &queue, int64_t n, sycl::buffer &x, - const int64_t incx, sycl::buffer &result) { +inline void nrm2(Func func, sycl::queue& queue, int64_t n, sycl::buffer& x, + const int64_t incx, sycl::buffer& result) { column_major::nrm2(func, queue, n, x, incx, result); } #define NRM2_LAUNCHER(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - void nrm2(sycl::queue &queue, int64_t n, sycl::buffer &x, const int64_t incx, \ - sycl::buffer &result) { \ + void nrm2(sycl::queue& queue, int64_t n, sycl::buffer& x, const int64_t incx, \ + sycl::buffer& result) { \ nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result); \ } @@ -1492,14 +1490,14 @@ NRM2_LAUNCHER(std::complex, double, rocblas_dznrm2) // USM APIs template -inline sycl::event asum(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx, - T2 *result, const std::vector &dependencies) { +inline sycl::event asum(Func func, sycl::queue& queue, int64_t n, const T1* x, const int64_t incx, + T2* result, const std::vector& dependencies) { return column_major::asum(func, queue, n, x, incx, result, dependencies); } #define ASUM_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - sycl::event asum(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event asum(sycl::queue& queue, int64_t n, const TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return asum(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } 
@@ -1511,14 +1509,14 @@ ASUM_LAUNCHER_USM(std::complex, double, rocblas_dzasum) #undef ASUM_LAUNCHER_USM template -inline sycl::event scal(Func func, sycl::queue &queue, int64_t n, T1 a, T2 *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event scal(Func func, sycl::queue& queue, int64_t n, T1 a, T2* x, int64_t incx, + const std::vector& dependencies) { return column_major::scal(func, queue, n, a, x, incx, dependencies); } #define SCAL_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - sycl::event scal(sycl::queue &queue, int64_t n, TYPE1 a, TYPE2 *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event scal(sycl::queue& queue, int64_t n, TYPE1 a, TYPE2* x, int64_t incx, \ + const std::vector& dependencies) { \ return scal(ROCBLAS_ROUTINE, queue, n, a, x, incx, dependencies); \ } @@ -1532,14 +1530,14 @@ SCAL_LAUNCHER_USM(double, std::complex, rocblas_zdscal) #undef SCAL_LAUNCHER_USM template -inline sycl::event axpy(Func func, sycl::queue &queue, int64_t n, T alpha, const T *x, int64_t incx, - T *y, int64_t incy, const std::vector &dependencies) { +inline sycl::event axpy(Func func, sycl::queue& queue, int64_t n, T alpha, const T* x, int64_t incx, + T* y, int64_t incy, const std::vector& dependencies) { return column_major::axpy(func, queue, n, alpha, x, incx, y, incy, dependencies); } #define AXPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event axpy(sycl::queue &queue, int64_t n, TYPE alpha, const TYPE *x, int64_t incx, \ - TYPE *y, int64_t incy, const std::vector &dependencies) { \ + sycl::event axpy(sycl::queue& queue, int64_t n, TYPE alpha, const TYPE* x, int64_t incx, \ + TYPE* y, int64_t incy, const std::vector& dependencies) { \ return axpy(ROCBLAS_ROUTINE, queue, n, alpha, x, incx, y, incy, dependencies); \ } @@ -1550,38 +1548,38 @@ AXPY_LAUNCHER_USM(std::complex, rocblas_zaxpy) #undef AXPY_LAUNCHER_USM -sycl::event axpby(sycl::queue &queue, int64_t n, float alpha, const float *x, int64_t incx, - float beta, float *y, 
int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, float alpha, const float* x, int64_t incx, + float beta, float* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for row_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, double alpha, const double *x, int64_t incx, - double beta, double *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, double alpha, const double* x, int64_t incx, + double beta, double* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for row_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for row_major layout"); } -sycl::event axpby(sycl::queue &queue, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(sycl::queue& queue, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { throw unimplemented("blas", "axpby", "for row_major layout"); } template -inline sycl::event rotg(Func func, sycl::queue &queue, T1 *a, T1 *b, T2 *c, T1 *s, - const std::vector &dependencies) { +inline sycl::event rotg(Func func, sycl::queue& queue, T1* a, T1* b, T2* c, T1* s, + const std::vector& dependencies) { return column_major::rotg(func, queue, a, b, c, s, dependencies); } #define ROTG_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - 
sycl::event rotg(sycl::queue &queue, TYPE1 *a, TYPE1 *b, TYPE2 *c, TYPE1 *s, \ - const std::vector &dependencies) { \ + sycl::event rotg(sycl::queue& queue, TYPE1* a, TYPE1* b, TYPE2* c, TYPE1* s, \ + const std::vector& dependencies) { \ return rotg(ROCBLAS_ROUTINE, queue, a, b, c, s, dependencies); \ } @@ -1593,14 +1591,14 @@ ROTG_LAUNCHER_USM(std::complex, double, rocblas_zrotg) #undef ROTG_LAUNCHER_USM template -inline sycl::event rotm(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y, - int64_t incy, T *param, const std::vector &dependencies) { +inline sycl::event rotm(Func func, sycl::queue& queue, int64_t n, T* x, int64_t incx, T* y, + int64_t incy, T* param, const std::vector& dependencies) { return column_major::rotm(func, queue, n, x, incx, y, incy, param, dependencies); } #define ROTM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event rotm(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - TYPE *param, const std::vector &dependencies) { \ + sycl::event rotm(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + TYPE* param, const std::vector& dependencies) { \ return rotm(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, param, dependencies); \ } @@ -1610,14 +1608,14 @@ ROTM_LAUNCHER_USM(double, rocblas_drotm) #undef ROTM_LAUNCHER_USM template -inline sycl::event copy(Func func, sycl::queue &queue, int64_t n, const T *x, int64_t incx, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event copy(Func func, sycl::queue& queue, int64_t n, const T* x, int64_t incx, T* y, + int64_t incy, const std::vector& dependencies) { return column_major::copy(func, queue, n, x, incx, y, incy, dependencies); } #define COPY_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event copy(sycl::queue &queue, int64_t n, const TYPE *x, int64_t incx, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event copy(sycl::queue& queue, int64_t n, const TYPE* x, int64_t incx, TYPE* 
y, \ + int64_t incy, const std::vector& dependencies) { \ return copy(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -1629,16 +1627,16 @@ COPY_LAUNCHER_USM(std::complex, rocblas_zcopy) #undef COPY_LAUNCHER_USM template -inline sycl::event dot(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx, - const T *y, int64_t incy, T *result, - const std::vector &dependencies) { +inline sycl::event dot(Func func, sycl::queue& queue, int64_t n, const T* x, const int64_t incx, + const T* y, int64_t incy, T* result, + const std::vector& dependencies) { return column_major::dot(func, queue, n, x, incx, y, incy, result, dependencies); } #define DOT_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE) \ - sycl::event dot##EXT(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - const TYPE *y, const int64_t incy, TYPE *result, \ - const std::vector &dependencies) { \ + sycl::event dot##EXT(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + const TYPE* y, const int64_t incy, TYPE* result, \ + const std::vector& dependencies) { \ return dot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, result, dependencies); \ } @@ -1651,21 +1649,21 @@ DOT_LAUNCHER_USM(c, std::complex, rocblas_zdotc) #undef DOT_LAUNCHER_USM -sycl::event dot(sycl::queue &queue, int64_t n, const float *x, int64_t incx, const float *y, - int64_t incy, double *result, const std::vector &dependencies) { +sycl::event dot(sycl::queue& queue, int64_t n, const float* x, int64_t incx, const float* y, + int64_t incy, double* result, const std::vector& dependencies) { throw unimplemented("blas", "dot", "for row_major layout"); } template -inline sycl::event rot(Func func, sycl::queue &queue, int64_t n, T1 *x, const int64_t incx, T1 *y, - int64_t incy, T2 c, T3 s, const std::vector &dependencies) { +inline sycl::event rot(Func func, sycl::queue& queue, int64_t n, T1* x, const int64_t incx, T1* y, + int64_t incy, T2 c, T3 s, const std::vector& dependencies) { return 
column_major::rot(func, queue, n, x, incx, y, incy, c, s, dependencies); } #define ROT_LAUNCHER_USM(TYPE1, TYPE2, TYPE3, ROCBLAS_ROUTINE) \ - sycl::event rot(sycl::queue &queue, int64_t n, TYPE1 *x, const int64_t incx, TYPE1 *y, \ + sycl::event rot(sycl::queue& queue, int64_t n, TYPE1* x, const int64_t incx, TYPE1* y, \ int64_t incy, TYPE2 c, TYPE3 s, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return rot(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, c, s, dependencies); \ } @@ -1676,21 +1674,21 @@ ROT_LAUNCHER_USM(std::complex, double, double, rocblas_zdrot) #undef ROT_LAUNCHER_USM -sycl::event sdsdot(sycl::queue &queue, int64_t n, float sb, const float *x, int64_t incx, - const float *y, int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event sdsdot(sycl::queue& queue, int64_t n, float sb, const float* x, int64_t incx, + const float* y, int64_t incy, float* result, + const std::vector& dependencies) { return column_major::sdsdot(queue, n, sb, x, incx, y, incy, result); } template -inline sycl::event rotmg(Func func, sycl::queue &queue, T *d1, T *d2, T *x1, T y1, T *param, - const std::vector &dependencies) { +inline sycl::event rotmg(Func func, sycl::queue& queue, T* d1, T* d2, T* x1, T y1, T* param, + const std::vector& dependencies) { return column_major::rotmg(func, queue, d1, d2, x1, y1, param, dependencies); } #define ROTMG_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event rotmg(sycl::queue &queue, TYPE *d1, TYPE *d2, TYPE *x1, TYPE y1, TYPE *param, \ - const std::vector &dependencies) { \ + sycl::event rotmg(sycl::queue& queue, TYPE* d1, TYPE* d2, TYPE* x1, TYPE y1, TYPE* param, \ + const std::vector& dependencies) { \ return rotmg(ROCBLAS_ROUTINE, queue, d1, d2, x1, y1, param, dependencies); \ } @@ -1700,14 +1698,14 @@ ROTMG_LAUNCHER_USM(double, rocblas_drotmg) #undef ROTMG_LAUNCHER_USM template -inline sycl::event iamax(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx, - 
int64_t *result, const std::vector &dependencies) { +inline sycl::event iamax(Func func, sycl::queue& queue, int64_t n, const T* x, const int64_t incx, + int64_t* result, const std::vector& dependencies) { return column_major::iamax(func, queue, n, x, incx, result, dependencies); } #define IAMAX_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event iamax(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamax(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + int64_t* result, const std::vector& dependencies) { \ return iamax(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } @@ -1719,14 +1717,14 @@ IAMAX_LAUNCHER_USM(std::complex, rocblas_izamax) #undef IAMAX_LAUNCHER_USM template -inline sycl::event swap(Func func, sycl::queue &queue, int64_t n, T *x, int64_t incx, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event swap(Func func, sycl::queue& queue, int64_t n, T* x, int64_t incx, T* y, + int64_t incy, const std::vector& dependencies) { return column_major::swap(func, queue, n, x, incx, y, incy, dependencies); } #define SWAP_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event swap(sycl::queue &queue, int64_t n, TYPE *x, int64_t incx, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event swap(sycl::queue& queue, int64_t n, TYPE* x, int64_t incx, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return swap(ROCBLAS_ROUTINE, queue, n, x, incx, y, incy, dependencies); \ } @@ -1738,14 +1736,14 @@ SWAP_LAUNCHER_USM(std::complex, rocblas_zswap) #undef SWAP_LAUNCHER_USM template -inline sycl::event iamin(Func func, sycl::queue &queue, int64_t n, const T *x, const int64_t incx, - int64_t *result, const std::vector &dependencies) { +inline sycl::event iamin(Func func, sycl::queue& queue, int64_t n, const T* x, const int64_t incx, + int64_t* result, const std::vector& dependencies) { return 
column_major::iamin(func, queue, n, x, incx, result, dependencies); } #define IAMIN_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event iamin(sycl::queue &queue, int64_t n, const TYPE *x, const int64_t incx, \ - int64_t *result, const std::vector &dependencies) { \ + sycl::event iamin(sycl::queue& queue, int64_t n, const TYPE* x, const int64_t incx, \ + int64_t* result, const std::vector& dependencies) { \ return iamin(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } @@ -1757,14 +1755,14 @@ IAMIN_LAUNCHER_USM(std::complex, rocblas_izamin) #undef IAMIN_LAUNCHER_USM template -inline sycl::event nrm2(Func func, sycl::queue &queue, int64_t n, const T1 *x, const int64_t incx, - T2 *result, const std::vector &dependencies) { +inline sycl::event nrm2(Func func, sycl::queue& queue, int64_t n, const T1* x, const int64_t incx, + T2* result, const std::vector& dependencies) { return column_major::nrm2(func, queue, n, x, incx, result, dependencies); } #define NRM2_LAUNCHER_USM(TYPE1, TYPE2, ROCBLAS_ROUTINE) \ - sycl::event nrm2(sycl::queue &queue, int64_t n, const TYPE1 *x, const int64_t incx, \ - TYPE2 *result, const std::vector &dependencies) { \ + sycl::event nrm2(sycl::queue& queue, int64_t n, const TYPE1* x, const int64_t incx, \ + TYPE2* result, const std::vector& dependencies) { \ return nrm2(ROCBLAS_ROUTINE, queue, n, x, incx, result, dependencies); \ } diff --git a/src/blas/backends/rocblas/rocblas_level2.cpp b/src/blas/backends/rocblas/rocblas_level2.cpp index 882f7ff1c..b2a507890 100644 --- a/src/blas/backends/rocblas/rocblas_level2.cpp +++ b/src/blas/backends/rocblas/rocblas_level2.cpp @@ -28,7 +28,7 @@ // Helper Functions template -static inline void conj_vector(sycl::handler &cgh, sycl::buffer &buf, const int64_t len, +static inline void conj_vector(sycl::handler& cgh, sycl::buffer& buf, const int64_t len, const int64_t inc) { const auto abs_inc = std::abs(inc); auto acc = buf.template get_access(cgh); @@ -38,7 +38,7 @@ static inline void 
conj_vector(sycl::handler &cgh, sycl::buffer &buf, const i }); } template -static inline void conj_vector(sycl::handler &cgh, T *ptr, const int64_t len, const int64_t inc) { +static inline void conj_vector(sycl::handler& cgh, T* ptr, const int64_t len, const int64_t inc) { const auto abs_inc = std::abs(inc); cgh.parallel_for(sycl::range{ (std::size_t)len }, [=](sycl::id<1> id) { const auto index = id * abs_inc; @@ -47,7 +47,7 @@ static inline void conj_vector(sycl::handler &cgh, T *ptr, const int64_t len, co } template -static inline void conj_vector(sycl::handler &cgh, sycl::buffer &buf_a, sycl::buffer &buf_b, +static inline void conj_vector(sycl::handler& cgh, sycl::buffer& buf_a, sycl::buffer& buf_b, const int64_t len, const int64_t inc_a, const int64_t inc_b) { const auto abs_inc_a = std::abs(inc_a); const auto abs_inc_b = std::abs(inc_b); @@ -61,7 +61,7 @@ static inline void conj_vector(sycl::handler &cgh, sycl::buffer &buf_a, sycl: }); } template -static inline void conj_vector(sycl::handler &cgh, T *ptr_a, T *ptr_b, const int64_t len, +static inline void conj_vector(sycl::handler& cgh, T* ptr_a, T* ptr_b, const int64_t len, const int64_t inc_a, const int64_t inc_b) { const auto abs_inc_a = std::abs(inc_a); const auto abs_inc_b = std::abs(inc_b); @@ -82,34 +82,34 @@ namespace column_major { // Buffer APIs template -inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void gemv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, m, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template 
get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define GEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -121,34 +121,34 @@ GEMV_LAUNCHER(std::complex, rocblas_zgemv) #undef GEMV_LAUNCHER template -inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, - int64_t ku, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void gbmv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, + int64_t ku, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, m, lda, kl, ku, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc 
= x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, kl, ku, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define GBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ - int64_t incx, TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ + int64_t incx, TYPE beta, sycl::buffer& y, int64_t incy) { \ gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -160,32 +160,32 @@ GBMV_LAUNCHER(std::complex, rocblas_zgbmv) #undef GBMV_LAUNCHER template -inline void ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +inline void ger(Func func, sycl::queue& queue, int64_t m, int64_t n, T alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, m, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = 
y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; - ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, m, n, (rocDataType *)&alpha, x_, incx, y_, + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, m, n, (rocDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); } #define GER_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE) \ - void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer &x, \ - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, \ + void ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer& x, \ + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, \ int64_t lda) { \ ger(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda); \ } @@ -200,34 +200,34 @@ GER_LAUNCHER(c, std::complex, rocblas_zgerc) #undef GER_LAUNCHER template -inline void hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, 
[=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define HBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -237,34 +237,34 @@ HBMV_LAUNCHER(std::complex, rocblas_zhbmv) #undef HBMV_LAUNCHER template -inline void hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hemv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define HEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -274,31 +274,31 @@ HEMV_LAUNCHER(std::complex, rocblas_zhemv) #undef HEMV_LAUNCHER template -inline void her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a, +inline void her(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, ScalarType alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { using rocScalarType = typename RocEquivalentType::Type; using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); 
rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocScalarType *)&alpha, x_, incx, a_, lda); + (rocScalarType*)&alpha, x_, incx, a_, lda); }); }); } #define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, \ + void her(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, \ int64_t lda) { \ her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -309,33 +309,33 @@ HER_LAUNCHER(double, std::complex, rocblas_zher) #undef HER_LAUNCHER template -inline void her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void her2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda); + (rocDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); } #define 
HER2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); \ } @@ -345,34 +345,34 @@ HER2_LAUNCHER(std::complex, rocblas_zher2) #undef HER2_LAUNCHER template -inline void hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hpmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_, + (rocDataType*)&alpha, a_, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define HPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer 
&x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); \ } @@ -382,30 +382,30 @@ HPMV_LAUNCHER(std::complex, rocblas_zhpmv) #undef HPMV_LAUNCHER template -inline void hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a) { +inline void hpr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, ScalarType alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a) { using rocScalarType = typename RocEquivalentType::Type; using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocScalarType *)&alpha, x_, incx, a_); + (rocScalarType*)&alpha, x_, incx, a_); }); }); } #define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -415,33 +415,33 @@ HPR_LAUNCHER(double, std::complex, rocblas_zhpr) 
#undef HPR_LAUNCHER template -inline void hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void hpr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_); + (rocDataType*)&alpha, x_, incx, y_, incy, a_); }); }); } #define HPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a) { \ hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -451,34 +451,34 @@ HPR2_LAUNCHER(std::complex, rocblas_zhpr2) #undef HPR2_LAUNCHER template -inline void sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t 
incy) { +inline void sbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define SBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -488,34 +488,34 @@ SBMV_LAUNCHER(double, rocblas_dsbmv) #undef SBMV_LAUNCHER template -inline void symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void symv(Func func, 
sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define SYMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void symv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -525,29 +525,29 @@ SYMV_LAUNCHER(double, rocblas_dsymv) #undef SYMV_LAUNCHER template -inline void syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { +inline void syr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { using 
rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, a_, lda); + (rocDataType*)&alpha, x_, incx, a_, lda); }); }); } #define SYR_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { \ + void syr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { \ syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -560,33 +560,33 @@ SYR_LAUNCHER(std::complex, rocblas_zsyr) #undef SYR_LAUNCHER template -inline void syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void syr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + 
onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda); + (rocDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); } #define SYR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); \ } @@ -599,34 +599,34 @@ SYR2_LAUNCHER(std::complex, rocblas_zsyr2) #undef SYR2_LAUNCHER template -inline void spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void spmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = 
sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_, + (rocDataType*)&alpha, a_, x_, incx, (rocDataType*)&beta, y_, incy); }); }); } #define SPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); \ } @@ -636,29 +636,29 @@ SPMV_LAUNCHER(double, rocblas_dspmv) #undef SPMV_LAUNCHER template -inline void spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a) { +inline void spr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, a_); + (rocDataType*)&alpha, x_, incx, a_); }); }); } #define 
SPR_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -668,33 +668,33 @@ SPR_LAUNCHER(double, rocblas_dspr) #undef SPR_LAUNCHER template -inline void spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void spr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); auto y_acc = y.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); - auto y_ = sc.get_mem(y_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); + auto y_ = sc.get_mem(y_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_); + (rocDataType*)&alpha, x_, incx, y_, incy, a_); }); }); } #define SPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, 
sycl::buffer& y, int64_t incy, \ + sycl::buffer& a) { \ spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -704,20 +704,20 @@ SPR2_LAUNCHER(double, rocblas_dspr2) #undef SPR2_LAUNCHER template -inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void tbmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -727,8 +727,8 @@ inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); \ } @@ -741,20 +741,20 @@ TBMV_LAUNCHER(std::complex, rocblas_ztbmv) #undef TBMV_LAUNCHER template -inline void 
tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void tbsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -764,8 +764,8 @@ inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TBSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); \ } @@ -778,19 +778,19 @@ TBSV_LAUNCHER(std::complex, rocblas_ztbsv) #undef TBSV_LAUNCHER template -inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, sycl::buffer &x, int64_t incx) { +inline void tpmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag 
unit_diag, + int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -800,8 +800,8 @@ inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx); \ } @@ -813,19 +813,19 @@ TPMV_LAUNCHER(std::complex, rocblas_ztpmv) #undef TPMV_LAUNCHER template -inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, sycl::buffer &x, int64_t incx) { +inline void tpsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - 
onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -835,8 +835,8 @@ inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TPSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx); \ } @@ -848,20 +848,20 @@ TPSV_LAUNCHER(std::complex, rocblas_ztpsv) #undef TPSV_LAUNCHER template -inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void trmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = 
sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -871,8 +871,8 @@ inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TRMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); \ } @@ -884,20 +884,20 @@ TRMV_LAUNCHER(std::complex, rocblas_ztrmv) #undef TRMV_LAUNCHER template -inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void trsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto x_acc = x.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto x_ = sc.get_mem(x_acc); + auto a_ = sc.get_mem(a_acc); + auto x_ = sc.get_mem(x_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -907,8 +907,8 @@ inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose 
tran } #define TRSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); \ } @@ -922,23 +922,23 @@ TRSV_LAUNCHER(std::complex, rocblas_ztrsv) // USM APIs template -inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event gemv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + T alpha, const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -947,9 +947,9 @@ inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t } #define GEMV_LAUNCHER_USM(TYPE, 
ROCBLAS_ROUTINE) \ - sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, \ dependencies); \ } @@ -962,24 +962,24 @@ GEMV_LAUNCHER_USM(std::complex, rocblas_zgemv) #undef GEMV_LAUNCHER_USM template -inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gbmv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + int64_t kl, int64_t ku, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, m, lda, kl, ku, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(trans), m, n, kl, ku, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + 
(rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -988,10 +988,10 @@ inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t } #define GBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ - int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, \ - int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ + int64_t ku, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* x, \ + int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, \ incy, dependencies); \ } @@ -1004,22 +1004,22 @@ GBMV_LAUNCHER_USM(std::complex, rocblas_zgbmv) #undef GBMV_LAUNCHER_USM template -inline sycl::event ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, const T *x, - int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event ger(Func func, sycl::queue& queue, int64_t m, int64_t n, T alpha, const T* x, + int64_t incx, const T* y, int64_t incy, T* a, int64_t lda, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, m, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; - 
ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, m, n, (rocDataType *)&alpha, x_, incx, y_, + ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, m, n, (rocDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); @@ -1028,9 +1028,9 @@ inline sycl::event ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T al } #define GER_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE) \ - sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return ger(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); \ } @@ -1044,23 +1044,23 @@ GER_LAUNCHER_USM(c, std::complex, rocblas_zgerc) #undef GER_LAUNCHER_USM template -inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event hbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + T alpha, const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto 
y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -1069,9 +1069,9 @@ inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \ incy, dependencies); \ } @@ -1082,23 +1082,23 @@ HBMV_LAUNCHER_USM(std::complex, rocblas_zhbmv) #undef HBMV_LAUNCHER_USM template -inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event hemv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = 
reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -1107,9 +1107,9 @@ inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HEMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \ dependencies); \ } @@ -1120,23 +1120,23 @@ HEMV_LAUNCHER_USM(std::complex, rocblas_zhemv) #undef HEMV_LAUNCHER_USM template -inline sycl::event her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - const ScalarType alpha, const DataType *x, int64_t incx, DataType *a, - int64_t lda, const std::vector &dependencies) { +inline sycl::event her(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + const ScalarType alpha, const DataType* x, int64_t incx, DataType* a, + int64_t lda, const std::vector& dependencies) { using rocScalarType = typename RocEquivalentType::Type; using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + 
onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocScalarType *)&alpha, x_, incx, a_, lda); + (rocScalarType*)&alpha, x_, incx, a_, lda); }); }); @@ -1144,9 +1144,9 @@ inline sycl::event her(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \ } @@ -1156,23 +1156,23 @@ HER_LAUNCHER_USM(double, std::complex, rocblas_zher) #undef HER_LAUNCHER_USM template -inline sycl::event her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event her2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, int64_t lda, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, 
[=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda); + (rocDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); @@ -1180,9 +1180,9 @@ inline sycl::event her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HER2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \ dependencies); \ } @@ -1193,23 +1193,23 @@ HER2_LAUNCHER_USM(std::complex, rocblas_zher2) #undef HER2_LAUNCHER_USM template -inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event hpmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, const T* x, int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, 
queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_, + (rocDataType*)&alpha, a_, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -1218,9 +1218,9 @@ inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \ dependencies); \ } @@ -1231,23 +1231,23 @@ HPMV_LAUNCHER_USM(std::complex, rocblas_zhpmv) #undef HPMV_LAUNCHER_USM template -inline sycl::event hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - const ScalarType alpha, const DataType *x, int64_t incx, DataType *a, - const std::vector &dependencies) { +inline sycl::event hpr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + const ScalarType alpha, const DataType* x, int64_t incx, DataType* a, + const std::vector& dependencies) { using rocScalarType = typename RocEquivalentType::Type; using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, 
[=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocScalarType *)&alpha, x_, incx, a_); + (rocScalarType*)&alpha, x_, incx, a_); }); }); @@ -1255,9 +1255,9 @@ inline sycl::event hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, \ + const std::vector& dependencies) { \ return hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies); \ } @@ -1267,23 +1267,23 @@ HPR_LAUNCHER_USM(double, std::complex, rocblas_zhpr) #undef HPR_LAUNCHER_USM template -inline sycl::event hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, - const std::vector &dependencies) { +inline sycl::event hpr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_); + (rocDataType*)&alpha, x_, incx, y_, incy, a_); }); }); @@ -1291,9 +1291,9 @@ inline sycl::event hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ dependencies); \ } @@ -1304,23 +1304,23 @@ HPR2_LAUNCHER_USM(std::complex, rocblas_zhpr2) #undef HPR2_LAUNCHER_USM template -inline sycl::event sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event sbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + T alpha, const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& 
sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, k, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -1329,9 +1329,9 @@ inline sycl::event sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \ incy, dependencies); \ } @@ -1342,23 +1342,23 @@ SBMV_LAUNCHER_USM(double, rocblas_dsbmv) #undef SBMV_LAUNCHER_USM template -inline sycl::event symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event symv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, 
queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, lda, x_, incx, (rocDataType *)&beta, + (rocDataType*)&alpha, a_, lda, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -1367,9 +1367,9 @@ inline sycl::event symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SYMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \ dependencies); \ } @@ -1380,22 +1380,22 @@ SYMV_LAUNCHER_USM(double, rocblas_dsymv) #undef SYMV_LAUNCHER_USM template -inline sycl::event syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event syr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, T* a, int64_t lda, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - 
onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, a_, lda); + (rocDataType*)&alpha, x_, incx, a_, lda); }); }); @@ -1403,9 +1403,9 @@ inline sycl::event syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SYR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \ } @@ -1418,23 +1418,23 @@ SYR_LAUNCHER_USM(std::complex, rocblas_zsyr) #undef SYR_LAUNCHER_USM template -inline sycl::event syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event syr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, int64_t lda, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, 
[=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_, lda); + (rocDataType*)&alpha, x_, incx, y_, incy, a_, lda); }); }); @@ -1442,9 +1442,9 @@ inline sycl::event syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SYR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \ dependencies); \ } @@ -1458,23 +1458,23 @@ SYR2_LAUNCHER_USM(std::complex, rocblas_zsyr2) #undef SYR2_LAUNCHER_USM template -inline sycl::event spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event spmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, const T* x, int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, 
queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, a_, x_, incx, (rocDataType *)&beta, y_, + (rocDataType*)&alpha, a_, x_, incx, (rocDataType*)&beta, y_, incy); }); }); @@ -1483,9 +1483,9 @@ inline sycl::event spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \ dependencies); \ } @@ -1496,21 +1496,21 @@ SPMV_LAUNCHER_USM(double, rocblas_dspmv) #undef SPMV_LAUNCHER_USM template -inline sycl::event spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, T *a, - const std::vector &dependencies) { +inline sycl::event spr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, T* a, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto 
handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, a_); + (rocDataType*)&alpha, x_, incx, a_); }); }); @@ -1518,8 +1518,8 @@ inline sycl::event spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SPR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, const std::vector &dependencies) { \ + sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, const std::vector& dependencies) { \ return spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies); \ } @@ -1529,23 +1529,23 @@ SPR_LAUNCHER_USM(double, rocblas_dspr) #undef SPR_LAUNCHER_USM template -inline sycl::event spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, - const std::vector &dependencies) { +inline sycl::event spr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx, incy); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); - auto y_ = reinterpret_cast(y); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); + auto y_ = reinterpret_cast(y); 
rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), n, - (rocDataType *)&alpha, x_, incx, y_, incy, a_); + (rocDataType*)&alpha, x_, incx, y_, incy, a_); }); }); @@ -1553,9 +1553,9 @@ inline sycl::event spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ dependencies); \ } @@ -1566,19 +1566,19 @@ SPR2_LAUNCHER_USM(double, rocblas_dspr2) #undef SPR2_LAUNCHER_USM template -inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event tbmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, 
get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -1590,9 +1590,9 @@ inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \ dependencies); \ } @@ -1605,19 +1605,19 @@ TBMV_LAUNCHER_USM(std::complex, rocblas_ztbmv) #undef TBMV_LAUNCHER_USM template -inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event tbsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), 
get_rocblas_diag_type(unit_diag), @@ -1629,9 +1629,9 @@ inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TBSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \ dependencies); \ } @@ -1644,19 +1644,19 @@ TBSV_LAUNCHER_USM(std::complex, rocblas_ztbsv) #undef TBSV_LAUNCHER_USM template -inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -1668,9 +1668,9 @@ inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, 
transpo } #define TPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx, \ dependencies); \ } @@ -1683,19 +1683,19 @@ TPMV_LAUNCHER_USM(std::complex, rocblas_ztpmv) #undef TPMV_LAUNCHER_USM template -inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -1707,9 +1707,9 @@ inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TPSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE 
*x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx, \ dependencies); \ } @@ -1722,19 +1722,19 @@ TPSV_LAUNCHER_USM(std::complex, rocblas_ztpsv) #undef TPSV_LAUNCHER_USM template -inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event trmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -1746,9 +1746,9 @@ inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TRMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag 
unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \ dependencies); \ } @@ -1761,19 +1761,19 @@ TRMV_LAUNCHER_USM(std::complex, rocblas_ztrmv) #undef TRMV_LAUNCHER_USM template -inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event trsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, lda, incx); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto x_ = reinterpret_cast(x); + auto a_ = reinterpret_cast(a); + auto x_ = reinterpret_cast(x); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), @@ -1785,9 +1785,9 @@ inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TRSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return 
trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \ dependencies); \ } @@ -1806,10 +1806,10 @@ namespace row_major { // Buffer APIs template -inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx, std::complex beta, - sycl::buffer, 1> &y, int64_t incy) { +inline void gemv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx, std::complex beta, + sycl::buffer, 1>& y, int64_t incy) { auto new_trans = trans == oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -1818,10 +1818,10 @@ inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int6 beta = std::conj(beta); if (m > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, m, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, m, incx); }); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } } @@ -1830,15 +1830,15 @@ inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int6 if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } } template -inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void gemv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { auto new_trans = trans 
== oneapi::mkl::transpose::nontrans ? oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -1846,9 +1846,9 @@ inline void gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int6 } #define GEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -1860,10 +1860,10 @@ GEMV_LAUNCHER(std::complex, rocblas_zgemv) #undef GEMV_LAUNCHER template -inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, - int64_t ku, std::complex alpha, sycl::buffer, 1> &a, - int64_t lda, sycl::buffer, 1> &x, int64_t incx, - std::complex beta, sycl::buffer, 1> &y, int64_t incy) { +inline void gbmv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, + int64_t ku, std::complex alpha, sycl::buffer, 1>& a, + int64_t lda, sycl::buffer, 1>& x, int64_t incx, + std::complex beta, sycl::buffer, 1>& y, int64_t incy) { auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -1872,10 +1872,10 @@ inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int6 beta = std::conj(beta); if (m > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, m, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, m, incx); }); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } } @@ -1884,15 +1884,15 @@ inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int6 if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } } template -inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, - int64_t ku, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, - int64_t incx, T beta, sycl::buffer &y, int64_t incy) { +inline void gbmv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, + int64_t ku, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, + int64_t incx, T beta, sycl::buffer& y, int64_t incy) { auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -1900,9 +1900,9 @@ inline void gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int6 } #define GBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ - int64_t incx, TYPE beta, sycl::buffer &y, int64_t incy) { \ + void gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, int64_t ku, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ + int64_t incx, TYPE beta, sycl::buffer& y, int64_t incy) { \ gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -1914,35 +1914,35 @@ GBMV_LAUNCHER(std::complex, rocblas_zgbmv) #undef GBMV_LAUNCHER template -inline void gerc(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { +inline void gerc(Func func, sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda); } template -inline void geru(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - sycl::buffer, 1> &x, int64_t incx, - sycl::buffer, 1> &y, int64_t incy, - sycl::buffer, 1> &a, int64_t lda) { +inline void geru(Func func, sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + sycl::buffer, 1>& x, int64_t incx, + sycl::buffer, 1>& y, int64_t incy, + sycl::buffer, 1>& a, int64_t lda) { column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda); } template 
-inline void ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, sycl::buffer &x, - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, +inline void ger(Func func, sycl::queue& queue, int64_t m, int64_t n, T alpha, sycl::buffer& x, + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, int64_t lda) { column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda); } #define GER_LAUNCHER(EXT, TYPE, ROCBLAS_ROUTINE) \ - void ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer &x, \ - int64_t incx, sycl::buffer &y, int64_t incy, sycl::buffer &a, \ + void ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, sycl::buffer& x, \ + int64_t incx, sycl::buffer& y, int64_t incy, sycl::buffer& a, \ int64_t lda) { \ ger##EXT(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda); \ } @@ -1957,29 +1957,29 @@ GER_LAUNCHER(c, std::complex, rocblas_zgeru) #undef GER_LAUNCHER template -inline void hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_alpha = std::conj(alpha); auto new_beta = std::conj(beta); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, y, n, incx, incy); }); } column_major::hbmv(func, queue, new_uplo, n, k, new_alpha, a, lda, x, incx, new_beta, y, incy); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } #define HBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -1989,29 +1989,29 @@ HBMV_LAUNCHER(std::complex, rocblas_zhbmv) #undef HBMV_LAUNCHER template -inline void hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hemv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_alpha = std::conj(alpha); auto new_beta = std::conj(beta); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, y, n, incx, incy); }); } column_major::hemv(func, queue, new_uplo, n, new_alpha, a, lda, x, incx, new_beta, y, incy); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } #define HEMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -2021,22 +2021,22 @@ HEMV_LAUNCHER(std::complex, rocblas_zhemv) #undef HEMV_LAUNCHER template -inline void her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a, +inline void her(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, ScalarType alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } column_major::her(func, queue, new_uplo, n, alpha, x, incx, a, lda); } #define HER_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - void her(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, \ + void her(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, \ int64_t lda) { \ her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -2047,23 +2047,23 @@ HER_LAUNCHER(double, std::complex, rocblas_zher) #undef HER_LAUNCHER template -inline void her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void her2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, y, n, incx, incy); }); } column_major::her2(func, queue, new_uplo, n, alpha, y, incy, x, incx, a, lda); } #define HER2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); \ } @@ -2073,29 +2073,29 @@ HER2_LAUNCHER(std::complex, rocblas_zher2) #undef HER2_LAUNCHER template -inline void hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void hpmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_alpha = std::conj(alpha); auto new_beta = std::conj(beta); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, y, n, incx, incy); }); } column_major::hpmv(func, queue, new_uplo, n, new_alpha, a, x, incx, new_beta, y, incy); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } #define HPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); \ } @@ -2105,21 +2105,21 @@ HPMV_LAUNCHER(std::complex, rocblas_zhpmv) #undef HPMV_LAUNCHER template -inline void hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, ScalarType alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a) { +inline void hpr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, ScalarType alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } column_major::hpr(func, queue, new_uplo, n, alpha, x, incx, a); } #define HPR_LAUNCHER(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - void hpr(sycl::queue &queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void hpr(sycl::queue& queue, uplo upper_lower, int64_t n, SCALAR_TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -2129,23 +2129,23 @@ HPR_LAUNCHER(double, std::complex, rocblas_zhpr) #undef HPR_LAUNCHER template -inline void hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void hpr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, y, n, incx, incy); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, y, n, incx, incy); }); } column_major::hpr2(func, queue, new_uplo, n, alpha, y, incy, x, incx, a); } #define HPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a) { \ hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -2155,9 +2155,9 @@ HPR2_LAUNCHER(std::complex, rocblas_zhpr2) #undef HPR2_LAUNCHER template -inline void sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void sbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2165,9 +2165,9 @@ inline void sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int } #define SBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -2177,9 +2177,9 @@ SBMV_LAUNCHER(double, rocblas_dsbmv) #undef SBMV_LAUNCHER template -inline void symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void symv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2187,9 +2187,9 @@ inline void symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T a } #define SYMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx, \ - TYPE beta, sycl::buffer &y, int64_t incy) { \ + void symv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx, \ + TYPE beta, sycl::buffer& y, int64_t incy) { \ symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); \ } @@ -2199,8 +2199,8 @@ SYMV_LAUNCHER(double, rocblas_dsymv) #undef SYMV_LAUNCHER template -inline void syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { +inline void syr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2208,8 +2208,8 @@ inline void syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T al } #define SYR_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a, int64_t lda) { \ + void syr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a, int64_t lda) { \ syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda); \ } @@ -2222,9 +2222,9 @@ SYR_LAUNCHER(std::complex, rocblas_zsyr) #undef SYR_LAUNCHER template -inline void syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a, int64_t lda) { +inline void syr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a, int64_t lda) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2232,9 +2232,9 @@ inline void syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T a } #define SYR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a, int64_t lda) { \ + void syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a, int64_t lda) { \ syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); \ } @@ -2247,9 +2247,9 @@ SYR2_LAUNCHER(std::complex, rocblas_zsyr2) #undef SYR2_LAUNCHER template -inline void spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &a, sycl::buffer &x, int64_t incx, T beta, - sycl::buffer &y, int64_t incy) { +inline void spmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& a, sycl::buffer& x, int64_t incx, T beta, + sycl::buffer& y, int64_t incy) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2257,9 +2257,9 @@ inline void spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T a } #define SPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx, TYPE beta, \ - sycl::buffer &y, int64_t incy) { \ + void spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx, TYPE beta, \ + sycl::buffer& y, int64_t incy) { \ spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); \ } @@ -2269,8 +2269,8 @@ SPMV_LAUNCHER(double, rocblas_dspmv) #undef SPMV_LAUNCHER template -inline void spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &a) { +inline void spr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& a) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2278,8 +2278,8 @@ inline void spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T al } #define SPR_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &a) { \ + void spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& a) { \ spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a); \ } @@ -2289,9 +2289,9 @@ SPR_LAUNCHER(double, rocblas_dspr) #undef SPR_LAUNCHER template -inline void spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, - sycl::buffer &a) { +inline void spr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, + sycl::buffer& a) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2299,9 +2299,9 @@ inline void spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T a } #define SPR2_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, \ - sycl::buffer &x, int64_t incx, sycl::buffer &y, int64_t incy, \ - sycl::buffer &a) { \ + void spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, \ + sycl::buffer& x, int64_t incx, sycl::buffer& y, int64_t incy, \ + sycl::buffer& a) { \ spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a); \ } @@ -2311,9 +2311,9 @@ SPR2_LAUNCHER(double, rocblas_dspr2) #undef SPR2_LAUNCHER template -inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { +inline void tbmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2321,7 +2321,7 @@ inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } @@ -2329,14 +2329,14 @@ inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } } template -inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void tbmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2347,8 +2347,8 @@ inline void tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TBMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); \ } @@ -2361,9 +2361,9 @@ TBMV_LAUNCHER(std::complex, rocblas_ztbmv) #undef TBMV_LAUNCHER template -inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, int64_t k, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { +inline void tbsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2371,7 +2371,7 @@ inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } @@ -2379,14 +2379,14 @@ inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } } template -inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void tbsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2397,8 +2397,8 @@ inline void tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TBSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - int64_t k, sycl::buffer &a, int64_t lda, sycl::buffer &x, \ + void tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + int64_t k, sycl::buffer& a, int64_t lda, sycl::buffer& x, \ int64_t incx) { \ tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); \ } @@ -2411,9 +2411,9 @@ TBSV_LAUNCHER(std::complex, rocblas_ztbsv) #undef TBSV_LAUNCHER template -inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx) { +inline void tpmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2421,7 +2421,7 @@ inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } @@ -2429,14 +2429,14 @@ inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } } template -inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, sycl::buffer &x, int64_t incx) { +inline void tpmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2446,8 +2446,8 @@ inline void tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TPMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx); \ } @@ -2459,9 +2459,9 @@ TPMV_LAUNCHER(std::complex, rocblas_ztpmv) #undef TPMV_LAUNCHER template -inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, int64_t incx) { +inline void tpsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2469,7 +2469,7 @@ inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } @@ -2477,14 +2477,14 @@ inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } } template -inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, sycl::buffer &x, int64_t incx) { +inline void tpsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer& a, sycl::buffer& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2494,8 +2494,8 @@ inline void tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TPSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, sycl::buffer &x, int64_t incx) { \ + void tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, sycl::buffer& x, int64_t incx) { \ tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx); \ } @@ -2507,9 +2507,9 @@ TPSV_LAUNCHER(std::complex, rocblas_ztpsv) #undef TPSV_LAUNCHER template -inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { +inline void trmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2517,7 +2517,7 @@ inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } @@ -2525,14 +2525,14 @@ inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } } template -inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void trmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2543,8 +2543,8 @@ inline void trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TRMV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); \ } @@ -2556,9 +2556,9 @@ TRMV_LAUNCHER(std::complex, rocblas_ztrmv) #undef TRMV_LAUNCHER template -inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer, 1> &a, int64_t lda, - sycl::buffer, 1> &x, int64_t incx) { +inline void trsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer, 1>& a, int64_t lda, + sycl::buffer, 1>& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2566,7 +2566,7 @@ inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } @@ -2574,14 +2574,14 @@ inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }); + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }); } } } template -inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, - int64_t n, sycl::buffer &a, int64_t lda, sycl::buffer &x, +inline void trsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, + int64_t n, sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -2592,8 +2592,8 @@ inline void trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define TRSV_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ - sycl::buffer &a, int64_t lda, sycl::buffer &x, int64_t incx) { \ + void trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, int64_t n, \ + sycl::buffer& a, int64_t lda, sycl::buffer& x, int64_t incx) { \ trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); \ } @@ -2607,11 +2607,11 @@ TRSV_LAUNCHER(std::complex, rocblas_ztrsv) // USM APIs template -inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gemv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { sycl::event done; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2623,10 +2623,10 @@ inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t if (m > 0) { done = queue.submit( - [&](sycl::handler &cgh) { conj_vector(cgh, (std::complex *)x, m, incx); }); + [&](sycl::handler& cgh) { conj_vector(cgh, (std::complex*)x, m, incx); }); if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + done = queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } } @@ -2638,7 +2638,7 @@ inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, y, n, incy); }); @@ -2649,9 +2649,9 @@ inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t } template -inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event gemv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + T alpha, const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -2660,9 +2660,9 @@ inline sycl::event gemv(Func func, sycl::queue &queue, transpose trans, int64_t } #define GEMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event gemv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event gemv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return gemv(ROCBLAS_ROUTINE, queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, \ dependencies); \ } @@ -2675,11 +2675,11 @@ GEMV_LAUNCHER_USM(std::complex, rocblas_zgemv) #undef GEMV_LAUNCHER_USM template -inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, std::complex alpha, const std::complex *a, - int64_t lda, const std::complex *x, int64_t incx, std::complex beta, - std::complex *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gbmv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + int64_t kl, int64_t ku, std::complex alpha, const std::complex* a, + int64_t lda, const std::complex* x, int64_t incx, std::complex beta, + std::complex* y, int64_t incy, + const std::vector& dependencies) { sycl::event done; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -2691,10 +2691,10 @@ inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t if (m > 0) { done = queue.submit( - [&](sycl::handler &cgh) { conj_vector(cgh, (std::complex *)x, m, incx); }); + [&](sycl::handler& cgh) { conj_vector(cgh, (std::complex*)x, m, incx); }); if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, y, n, incy); }); + done = queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, y, n, incy); }); } } } @@ -2706,7 +2706,7 @@ inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, y, n, incy); }); @@ -2717,10 +2717,10 @@ inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t } template -inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t m, int64_t n, - int64_t kl, int64_t ku, T alpha, const T *a, int64_t lda, const T *x, - int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event gbmv(Func func, sycl::queue& queue, transpose trans, int64_t m, int64_t n, + int64_t kl, int64_t ku, T alpha, const T* a, int64_t lda, const T* x, + int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans : oneapi::mkl::transpose::nontrans; @@ -2729,10 +2729,10 @@ inline sycl::event gbmv(Func func, sycl::queue &queue, transpose trans, int64_t } #define GBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event gbmv(sycl::queue &queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ - int64_t ku, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *x, \ - int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event gbmv(sycl::queue& queue, transpose trans, int64_t m, int64_t n, int64_t kl, \ + int64_t ku, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* x, \ + int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return gbmv(ROCBLAS_ROUTINE, queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, \ incy, dependencies); \ } @@ -2745,12 +2745,12 @@ GBMV_LAUNCHER_USM(std::complex, rocblas_zgbmv) #undef GBMV_LAUNCHER_USM template -inline sycl::event gerc(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event gerc(Func func, sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (std::complex *)y, n, incy); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (std::complex*)y, n, incy); }) .wait_and_throw(); } @@ -2758,24 +2758,24 @@ inline sycl::event gerc(Func func, sycl::queue &queue, int64_t m, int64_t n, std } template -inline sycl::event geru(Func func, sycl::queue &queue, int64_t m, int64_t n, std::complex alpha, - const std::complex *x, int64_t incx, const std::complex *y, - int64_t incy, std::complex *a, int64_t lda, - const std::vector 
&dependencies) { +inline sycl::event geru(Func func, sycl::queue& queue, int64_t m, int64_t n, std::complex alpha, + const std::complex* x, int64_t incx, const std::complex* y, + int64_t incy, std::complex* a, int64_t lda, + const std::vector& dependencies) { return column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda, dependencies); } template -inline sycl::event ger(Func func, sycl::queue &queue, int64_t m, int64_t n, T alpha, const T *x, - int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event ger(Func func, sycl::queue& queue, int64_t m, int64_t n, T alpha, const T* x, + int64_t incx, const T* y, int64_t incy, T* a, int64_t lda, + const std::vector& dependencies) { return column_major::ger(func, queue, n, m, alpha, y, incy, x, incx, a, lda, dependencies); } #define GER_LAUNCHER_USM(EXT, TYPE, ROCBLAS_ROUTINE) \ - sycl::event ger##EXT(sycl::queue &queue, int64_t m, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event ger##EXT(sycl::queue& queue, int64_t m, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return ger##EXT(ROCBLAS_ROUTINE, queue, m, n, alpha, x, incx, y, incy, a, lda, \ dependencies); \ } @@ -2790,9 +2790,9 @@ GER_LAUNCHER_USM(c, std::complex, rocblas_zgeru) #undef GER_LAUNCHER_USM template -inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event hbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + T alpha, const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { sycl::event done; auto new_uplo = 
upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper @@ -2801,7 +2801,7 @@ inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t auto new_beta = std::conj(beta); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, y, n, incx, incy); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (T*)x, y, n, incx, incy); }) .wait_and_throw(); } @@ -2809,7 +2809,7 @@ inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t incy, dependencies); if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, y, n, incy); }); @@ -2819,9 +2819,9 @@ inline sycl::event hbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event hbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return hbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \ incy, dependencies); \ } @@ -2832,9 +2832,9 @@ HBMV_LAUNCHER_USM(std::complex, rocblas_zhbmv) #undef HBMV_LAUNCHER_USM template -inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event hemv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower 
== oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper @@ -2843,7 +2843,7 @@ inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t auto new_beta = std::conj(beta); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, y, n, incx, incy); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (T*)x, y, n, incx, incy); }) .wait_and_throw(); } @@ -2851,7 +2851,7 @@ inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t incy, dependencies); if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, y, n, incy); }); @@ -2861,9 +2861,9 @@ inline sycl::event hemv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HEMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hemv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hemv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hemv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \ dependencies); \ } @@ -2874,14 +2874,14 @@ HEMV_LAUNCHER_USM(std::complex, rocblas_zhemv) #undef HEMV_LAUNCHER_USM template -inline sycl::event her(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - const ScalarType alpha, const DataType *x, int64_t incx, DataType *a, - int64_t lda, const std::vector &dependencies) { +inline sycl::event her(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + const ScalarType alpha, const DataType* x, int64_t incx, DataType* a, + int64_t lda, const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (DataType *)x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (DataType*)x, n, incx); }) .wait_and_throw(); } @@ -2889,9 +2889,9 @@ inline sycl::event her(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HER_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - sycl::event her(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event her(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \ } @@ -2901,14 +2901,14 @@ HER_LAUNCHER_USM(double, std::complex, rocblas_zher) #undef HER_LAUNCHER_USM template -inline sycl::event her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event her2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, int64_t lda, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, (T *)y, n, incx, incy); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (T*)x, (T*)y, n, incx, incy); }) .wait_and_throw(); } @@ -2917,9 +2917,9 @@ inline sycl::event her2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HER2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event her2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event her2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return her2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \ dependencies); \ } @@ -2930,9 +2930,9 @@ HER2_LAUNCHER_USM(std::complex, rocblas_zher2) #undef HER2_LAUNCHER_USM template -inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event hpmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, const T* x, int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -2941,7 +2941,7 @@ inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t auto new_beta = std::conj(beta); if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, y, n, incx, incy); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (T*)x, y, n, incx, incy); }) .wait_and_throw(); } @@ -2949,7 +2949,7 @@ inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t dependencies); if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, y, n, incy); }); @@ -2959,9 +2959,9 @@ inline sycl::event hpmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hpmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event hpmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return hpmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \ dependencies); \ } @@ -2972,14 +2972,14 @@ HPMV_LAUNCHER_USM(std::complex, rocblas_zhpmv) #undef HPMV_LAUNCHER_USM template -inline sycl::event hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, - const ScalarType alpha, const DataType *x, int64_t incx, DataType *a, - const std::vector &dependencies) { +inline sycl::event hpr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, + const ScalarType alpha, const DataType* x, int64_t incx, DataType* a, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (DataType *)x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (DataType*)x, n, incx); }) .wait_and_throw(); } @@ -2987,9 +2987,9 @@ inline sycl::event hpr(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HPR_LAUNCHER_USM(SCALAR_TYPE, DATA_TYPE, ROCBLAS_ROUTINE) \ - sycl::event hpr(sycl::queue &queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ - const DATA_TYPE *x, int64_t incx, DATA_TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr(sycl::queue& queue, uplo upper_lower, int64_t n, const SCALAR_TYPE alpha, \ + const DATA_TYPE* x, int64_t incx, DATA_TYPE* a, \ + const std::vector& dependencies) { \ return hpr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies); \ } @@ -2999,14 +2999,14 @@ HPR_LAUNCHER_USM(double, std::complex, rocblas_zhpr) #undef HPR_LAUNCHER_USM template -inline sycl::event hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, - const std::vector &dependencies) { +inline sycl::event hpr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, (T *)x, (T *)y, n, incx, incy); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, (T*)x, (T*)y, n, incx, incy); }) .wait_and_throw(); } @@ -3014,9 +3014,9 @@ inline sycl::event hpr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define HPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hpr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event hpr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return hpr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ dependencies); \ } @@ -3027,9 +3027,9 @@ HPR2_LAUNCHER_USM(std::complex, rocblas_zhpr2) #undef HPR2_LAUNCHER_USM template -inline sycl::event sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, - T alpha, const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event sbmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, + T alpha, const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -3038,9 +3038,9 @@ inline sycl::event sbmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event sbmv(sycl::queue &queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ - const TYPE *a, int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, \ - int64_t incy, const std::vector &dependencies) { \ + sycl::event sbmv(sycl::queue& queue, uplo upper_lower, int64_t n, int64_t k, TYPE alpha, \ + const TYPE* a, int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, \ + int64_t incy, const std::vector& dependencies) { \ return sbmv(ROCBLAS_ROUTINE, queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, \ incy, dependencies); \ } @@ -3051,9 +3051,9 @@ SBMV_LAUNCHER_USM(double, rocblas_dsbmv) #undef SBMV_LAUNCHER_USM template -inline sycl::event symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, int64_t lda, const T *x, int64_t incx, T beta, T *y, - int64_t incy, const std::vector &dependencies) { +inline sycl::event symv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, int64_t lda, const T* x, int64_t incx, T beta, T* y, + int64_t incy, const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -3062,9 +3062,9 @@ inline sycl::event symv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SYMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event symv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - int64_t lda, const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event symv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + int64_t lda, const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return symv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, \ dependencies); \ } @@ -3075,9 +3075,9 @@ SYMV_LAUNCHER_USM(double, rocblas_dsymv) #undef SYMV_LAUNCHER_USM template -inline sycl::event syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event syr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, T* a, int64_t lda, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -3085,9 +3085,9 @@ inline sycl::event syr(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SYR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); \ } @@ -3100,9 +3100,9 @@ SYR_LAUNCHER_USM(std::complex, rocblas_zsyr) #undef SYR_LAUNCHER_USM template -inline sycl::event syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, int64_t lda, - const std::vector &dependencies) { +inline sycl::event syr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, int64_t lda, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -3111,9 +3111,9 @@ inline sycl::event syr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SYR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, int64_t lda, \ - const std::vector &dependencies) { \ + sycl::event syr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, int64_t lda, \ + const std::vector& dependencies) { \ return syr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, \ dependencies); \ } @@ -3127,9 +3127,9 @@ SYR2_LAUNCHER_USM(std::complex, rocblas_zsyr2) #undef SYR2_LAUNCHER_USM template -inline sycl::event spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *a, const T *x, int64_t incx, T beta, T *y, int64_t incy, - const std::vector &dependencies) { +inline sycl::event spmv(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* a, const T* x, int64_t incx, T beta, T* y, int64_t incy, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -3138,9 +3138,9 @@ inline sycl::event spmv(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event spmv(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *a, \ - const TYPE *x, int64_t incx, TYPE beta, TYPE *y, int64_t incy, \ - const std::vector &dependencies) { \ + sycl::event spmv(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* a, \ + const TYPE* x, int64_t incx, TYPE beta, TYPE* y, int64_t incy, \ + const std::vector& dependencies) { \ return spmv(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, \ dependencies); \ } @@ -3151,9 +3151,9 @@ SPMV_LAUNCHER_USM(double, rocblas_dspmv) #undef SPMV_LAUNCHER_USM template -inline sycl::event spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, T *a, - const std::vector &dependencies) { +inline sycl::event spr(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, T* a, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -3161,8 +3161,8 @@ inline sycl::event spr(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SPR_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event spr(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, TYPE *a, const std::vector &dependencies) { \ + sycl::event spr(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, TYPE* a, const std::vector& dependencies) { \ return spr(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, a, dependencies); \ } @@ -3172,9 +3172,9 @@ SPR_LAUNCHER_USM(double, rocblas_dspr) #undef SPR_LAUNCHER_USM template -inline sycl::event spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t n, T alpha, - const T *x, int64_t incx, const T *y, int64_t incy, T *a, - const std::vector &dependencies) { +inline sycl::event spr2(Func func, sycl::queue& queue, uplo upper_lower, int64_t n, T alpha, + const T* x, int64_t incx, const T* y, int64_t incy, T* a, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; @@ -3182,9 +3182,9 @@ inline sycl::event spr2(Func func, sycl::queue &queue, uplo upper_lower, int64_t } #define SPR2_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event spr2(sycl::queue &queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE *x, \ - int64_t incx, const TYPE *y, int64_t incy, TYPE *a, \ - const std::vector &dependencies) { \ + sycl::event spr2(sycl::queue& queue, uplo upper_lower, int64_t n, TYPE alpha, const TYPE* x, \ + int64_t incx, const TYPE* y, int64_t incy, TYPE* a, \ + const std::vector& dependencies) { \ return spr2(ROCBLAS_ROUTINE, queue, upper_lower, n, alpha, x, incx, y, incy, a, \ dependencies); \ } @@ -3195,10 +3195,10 @@ SPR2_LAUNCHER_USM(double, rocblas_dspr2) #undef SPR2_LAUNCHER_USM template -inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tbmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -3208,7 +3208,7 @@ inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }) .wait_and_throw(); } } @@ -3218,7 +3218,7 @@ inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, x, n, incx); }); @@ -3229,9 +3229,9 @@ inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } template -inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event tbmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -3242,9 +3242,9 @@ inline sycl::event tbmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TBMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tbmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \ dependencies); \ } @@ -3257,10 +3257,10 @@ TBMV_LAUNCHER_USM(std::complex, rocblas_ztbmv) #undef TBMV_LAUNCHER_USM template -inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tbsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -3270,7 +3270,7 @@ inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }) .wait_and_throw(); } } @@ -3280,7 +3280,7 @@ inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, x, n, incx); }); @@ -3291,9 +3291,9 @@ inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } template -inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, int64_t k, const T *a, int64_t lda, T *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event tbsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, int64_t k, const T* a, int64_t lda, T* x, + int64_t incx, const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -3304,9 +3304,9 @@ inline sycl::event tbsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TBSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tbsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, int64_t k, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tbsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, int64_t k, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tbsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, \ dependencies); \ } @@ -3319,9 +3319,9 @@ TBSV_LAUNCHER_USM(std::complex, rocblas_ztbsv) #undef TBSV_LAUNCHER_USM template -inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, std::complex *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event tpmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const std::complex* a, std::complex* x, + int64_t incx, const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -3331,7 +3331,7 @@ inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }) .wait_and_throw(); } } @@ -3343,7 +3343,7 @@ inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (n > 0) { incx = std::abs(incx); - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, x, n, incx); }); @@ -3354,9 +3354,9 @@ inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } template -inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -3367,9 +3367,9 @@ inline sycl::event tpmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TPMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tpmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx, \ dependencies); \ } @@ -3382,9 +3382,9 @@ TPMV_LAUNCHER_USM(std::complex, rocblas_ztpmv) #undef TPMV_LAUNCHER_USM template -inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, std::complex *x, - int64_t incx, const std::vector &dependencies) { +inline sycl::event tpsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const std::complex* a, std::complex* x, + int64_t incx, const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -3394,7 +3394,7 @@ inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }) .wait_and_throw(); } } @@ -3406,7 +3406,7 @@ inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (n > 0) { incx = std::abs(incx); - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, x, n, incx); }); @@ -3417,9 +3417,9 @@ inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } template -inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event tpsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, T* x, int64_t incx, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -3430,9 +3430,9 @@ inline sycl::event tpsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TPSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event tpsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event tpsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return tpsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, x, incx, \ dependencies); \ } @@ -3445,10 +3445,10 @@ TPSV_LAUNCHER_USM(std::complex, rocblas_ztpsv) #undef TPSV_LAUNCHER_USM template -inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event trmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -3458,7 +3458,7 @@ inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }) .wait_and_throw(); } } @@ -3468,7 +3468,7 @@ inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, x, n, incx); }); @@ -3479,9 +3479,9 @@ inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } template -inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event trmv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -3492,9 +3492,9 @@ inline sycl::event trmv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TRMV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trmv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trmv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return trmv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \ dependencies); \ } @@ -3507,10 +3507,10 @@ TRMV_LAUNCHER_USM(std::complex, rocblas_ztrmv) #undef TRMV_LAUNCHER_USM template -inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const std::complex *a, int64_t lda, - std::complex *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event trsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const std::complex* a, int64_t lda, + std::complex* x, int64_t incx, + const std::vector& dependencies) { sycl::event done; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -3520,7 +3520,7 @@ inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - queue.submit([&](sycl::handler &cgh) { conj_vector(cgh, x, n, incx); }) + queue.submit([&](sycl::handler& cgh) { conj_vector(cgh, x, n, incx); }) .wait_and_throw(); } } @@ -3530,7 +3530,7 @@ inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpo if (trans == oneapi::mkl::transpose::conjtrans) { if (n > 0) { - done = queue.submit([&](sycl::handler &cgh) { + done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); conj_vector(cgh, x, n, incx); }); @@ -3541,9 +3541,9 @@ inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } template -inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, int64_t n, const T *a, int64_t lda, T *x, int64_t incx, - const std::vector &dependencies) { +inline sycl::event trsv(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, int64_t n, const T* a, int64_t lda, T* x, int64_t incx, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -3554,9 +3554,9 @@ inline sycl::event trsv(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define TRSV_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsv(sycl::queue &queue, uplo upper_lower, transpose trans, diag unit_diag, \ - int64_t n, const TYPE *a, int64_t lda, TYPE *x, int64_t incx, \ - const std::vector &dependencies) { \ + sycl::event trsv(sycl::queue& queue, uplo upper_lower, transpose trans, diag unit_diag, \ + int64_t n, const TYPE* a, int64_t lda, TYPE* x, int64_t incx, \ + const std::vector& dependencies) { \ return trsv(ROCBLAS_ROUTINE, queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, \ dependencies); \ } diff --git a/src/blas/backends/rocblas/rocblas_level3.cpp b/src/blas/backends/rocblas/rocblas_level3.cpp index ef739a88b..a525ad098 100644 --- a/src/blas/backends/rocblas/rocblas_level3.cpp +++ b/src/blas/backends/rocblas/rocblas_level3.cpp @@ -34,34 +34,34 @@ namespace column_major { // Buffer APIs template -inline void gemm(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void gemm(Func func, sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = 
sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, k, (rocDataType *)&alpha, - a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_operation(transb), m, n, k, (rocDataType*)&alpha, + a_, lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); } #define GEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE beta, sycl::buffer& c, \ int64_t ldc) { \ gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc); \ @@ -77,9 +77,9 @@ GEMM_LAUNCHER(std::complex, rocblas_zgemm) template inline void gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, COMPUTETYPE CT, - sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, T_S alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, T_S beta, sycl::buffer &c, + sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, T_S alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, T_S beta, sycl::buffer& c, int64_t ldc) { using rocDataType_A = typename RocEquivalentType::Type; using rocDataType_B = typename RocEquivalentType::Type; @@ -87,20 +87,20 @@ inline void gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C using rocDataType_S = typename RocEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + 
queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, k, (rocDataType_S *)&alpha, - a_, DT_A, lda, b_, DT_B, ldb, (rocDataType_S *)&beta, c_, DT_C, + get_rocblas_operation(transb), m, n, k, (rocDataType_S*)&alpha, + a_, DT_A, lda, b_, DT_B, ldb, (rocDataType_S*)&beta, c_, DT_C, ldc, c_, DT_C, ldc, CT, rocblas_gemm_algo_standard, 0, 0); }); }); @@ -108,9 +108,9 @@ inline void gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C #define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A, \ ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE_S beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE_S beta, sycl::buffer& c, \ int64_t ldc) { \ gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE, \ queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); \ @@ -130,34 +130,34 @@ GEMM_EX_LAUNCHER(bfloat16, bfloat16, bfloat16, float, rocblas_gemm_ex, rocblas_d #undef GEMM_EX_LAUNCHER template -inline void symm(Func func, sycl::queue 
&queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void symm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), - get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha, - a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_fill_mode(upper_lower), m, n, (rocDataType*)&alpha, + a_, lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); } #define SYMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \ c, ldc); \ } @@ 
-170,34 +170,34 @@ SYMM_LAUNCHER(std::complex, rocblas_zsymm) #undef SYMM_LAUNCHER template -inline void hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void hemm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), - get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha, - a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_fill_mode(upper_lower), m, n, (rocDataType*)&alpha, + a_, lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); } #define HEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, 
sycl::buffer& c, int64_t ldc) { \ hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \ c, ldc); \ } @@ -208,32 +208,32 @@ HEMM_LAUNCHER(std::complex, rocblas_zhemm) #undef HEMM_LAUNCHER template -inline void syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, T beta, - sycl::buffer &c, int64_t ldc) { +inline void syrk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, T beta, + sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, (rocDataType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, (rocDataType*)&beta, c_, ldc); }); }); } #define SYRK_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); \ } @@ 
-245,33 +245,33 @@ SYRK_LAUNCHER(std::complex, rocblas_zsyrk) #undef SYRK_LAUNCHER template -inline void herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, ScalarType alpha, sycl::buffer &a, int64_t lda, - ScalarType beta, sycl::buffer &c, int64_t ldc) { +inline void herk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, ScalarType alpha, sycl::buffer& a, int64_t lda, + ScalarType beta, sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; using rocScalarType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocScalarType *)&alpha, a_, - lda, (rocScalarType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocScalarType*)&alpha, a_, + lda, (rocScalarType*)&beta, c_, ldc); }); }); } #define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - SCALAR_TYPE alpha, sycl::buffer &a, int64_t lda, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + SCALAR_TYPE alpha, sycl::buffer& a, int64_t lda, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); \ 
} @@ -281,34 +281,34 @@ HERK_LAUNCHER(std::complex, double, rocblas_zherk) #undef HERK_LAUNCHER template -inline void syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void syr2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); } #define SYR2K_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, 
sycl::buffer& c, int64_t ldc) { \ syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc); \ } @@ -321,37 +321,37 @@ SYR2K_LAUNCHER(std::complex, rocblas_zsyr2k) #undef SYR2K_LAUNCHER template -inline void her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, DataType alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, ScalarType beta, - sycl::buffer &c, int64_t ldc) { +inline void her2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, DataType alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, ScalarType beta, + sycl::buffer& c, int64_t ldc) { using rocDataType = typename RocEquivalentType::Type; using rocScalarType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, b_, ldb, (rocScalarType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, b_, ldb, (rocScalarType*)&beta, c_, ldc); }); }); } #define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - DATA_TYPE alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer 
&b, int64_t ldb, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + DATA_TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc); \ } @@ -366,26 +366,26 @@ HER2K_LAUNCHER(std::complex, double, rocblas_zher2k) // separated from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. template -inline void trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { +inline void trmm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); rocblas_status err; #if ROCBLAS_VERSION_MAJOR >= 4 ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), - m, n, (rocDataType *)&alpha, a_, lda, b_, ldb, b_, ldb); + m, n, (rocDataType*)&alpha, a_, lda, b_, ldb, b_, ldb); #else ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, 
get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), @@ -397,9 +397,9 @@ inline void trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lowe } #define TRMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, sycl::buffer& b, int64_t ldb) { \ trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \ lda, b, ldb); \ } @@ -412,33 +412,33 @@ TRMM_LAUNCHER(std::complex, rocblas_ztrmm) #undef TRMM_LAUNCHER template -inline void trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { +inline void trsm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), 
get_rocblas_diag_type(unit_diag), - m, n, (rocDataType *)&alpha, a_, lda, b_, ldb); + m, n, (rocDataType*)&alpha, a_, lda, b_, ldb); }); }); } #define TRSM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, sycl::buffer& b, int64_t ldb) { \ trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \ lda, b, ldb); \ } @@ -453,25 +453,25 @@ TRSM_LAUNCHER(std::complex, rocblas_ztrsm) // USM APIs template -inline sycl::event gemm(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event gemm(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, int64_t k, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, 
get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, k, (rocDataType *)&alpha, - a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_operation(transb), m, n, k, (rocDataType*)&alpha, + a_, lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); @@ -479,10 +479,10 @@ inline sycl::event gemm(Func func, sycl::queue &queue, transpose transa, transpo } #define GEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, \ - int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, \ + int64_t ldb, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, \ c, ldc, dependencies); \ } @@ -497,28 +497,28 @@ GEMM_LAUNCHER_USM(std::complex, rocblas_zgemm) template inline sycl::event gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, - COMPUTETYPE CT, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, T_S alpha, const T_A *a, int64_t lda, - const T_B *b, int64_t ldb, T_S beta, T_C *c, int64_t ldc, - const std::vector &dependencies) { + COMPUTETYPE CT, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, int64_t k, T_S alpha, const T_A* a, int64_t lda, + const T_B* b, int64_t ldb, T_S beta, T_C* c, int64_t ldc, + const std::vector& dependencies) { using rocDataType_A = typename RocEquivalentType::Type; using rocDataType_B = typename RocEquivalentType::Type; using rocDataType_C = typename RocEquivalentType::Type; using rocDataType_S = typename RocEquivalentType::Type; overflow_check(m, n, k, lda, ldb, ldc); - auto done = 
queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_operation(transa), - get_rocblas_operation(transb), m, n, k, (rocDataType_S *)&alpha, - a_, DT_A, lda, b_, DT_B, ldb, (rocDataType_S *)&beta, c_, DT_C, + get_rocblas_operation(transb), m, n, k, (rocDataType_S*)&alpha, + a_, DT_A, lda, b_, DT_B, ldb, (rocDataType_S*)&beta, c_, DT_C, ldc, c_, DT_C, ldc, CT, rocblas_gemm_algo_standard, 0, 0); }); }); @@ -528,10 +528,10 @@ inline sycl::event gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE #define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A, \ ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b, \ - int64_t ldb, TYPE_S beta, TYPE_C *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, const TYPE_B* b, \ + int64_t ldb, TYPE_S beta, TYPE_C* c, int64_t ldc, \ + const std::vector& dependencies) { \ return gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C, \ ROCMCOMPUTETYPE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ @@ -551,24 +551,24 @@ GEMM_EX_LAUNCHER_USM(bfloat16, bfloat16, bfloat16, float, 
rocblas_gemm_ex, rocbl #undef GEMM_EX_LAUNCHER_USM template -inline sycl::event symm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb, - T beta, T *c, int64_t ldc, const std::vector &dependencies) { +inline sycl::event symm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, const T* a, int64_t lda, const T* b, int64_t ldb, + T beta, T* c, int64_t ldc, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), - get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha, - a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_fill_mode(upper_lower), m, n, (rocDataType*)&alpha, + a_, lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); @@ -576,10 +576,10 @@ inline sycl::event symm(Func func, sycl::queue &queue, side left_right, uplo upp } #define SYMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + 
TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -592,24 +592,24 @@ SYMM_LAUNCHER_USM(std::complex, rocblas_zsymm) #undef SYMM_LAUNCHER_USM template -inline sycl::event hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb, - T beta, T *c, int64_t ldc, const std::vector &dependencies) { +inline sycl::event hemm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, const T* a, int64_t lda, const T* b, int64_t ldb, + T beta, T* c, int64_t ldc, const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), - get_rocblas_fill_mode(upper_lower), m, n, (rocDataType *)&alpha, - a_, lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_fill_mode(upper_lower), m, n, (rocDataType*)&alpha, + a_, lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); @@ -617,10 +617,10 @@ inline sycl::event hemm(Func func, sycl::queue &queue, side left_right, uplo upp } #define HEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hemm(sycl::queue 
&queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -631,23 +631,23 @@ HEMM_LAUNCHER_USM(std::complex, rocblas_zhemm) #undef HEMM_LAUNCHER_USM template -inline sycl::event syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, const T *a, int64_t lda, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event syrk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, const T* a, int64_t lda, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, (rocDataType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, (rocDataType*)&beta, c_, ldc); }); }); @@ -655,9 +655,9 @@ 
inline sycl::event syrk(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define SYRK_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \ dependencies); \ } @@ -670,25 +670,25 @@ SYRK_LAUNCHER_USM(std::complex, rocblas_zsyrk) #undef SYRK_LAUNCHER_USM template -inline sycl::event herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, const ScalarType alpha, const DataType *a, int64_t lda, - const ScalarType beta, DataType *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event herk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, const ScalarType alpha, const DataType* a, int64_t lda, + const ScalarType beta, DataType* c, int64_t ldc, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; using rocScalarType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - 
get_rocblas_operation(trans), n, k, (rocScalarType *)&alpha, a_, - lda, (rocScalarType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocScalarType*)&alpha, a_, + lda, (rocScalarType*)&beta, c_, ldc); }); }); @@ -696,10 +696,10 @@ inline sycl::event herk(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda, \ - const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const SCALAR_TYPE alpha, const DATA_TYPE* a, int64_t lda, \ + const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \ dependencies); \ } @@ -710,25 +710,25 @@ HERK_LAUNCHER_USM(std::complex, double, rocblas_zherk) #undef HERK_LAUNCHER_USM template -inline sycl::event syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *b, - int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event syr2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, T alpha, const T* a, int64_t lda, const T* b, + int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto 
handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, b_, ldb, (rocDataType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, b_, ldb, (rocDataType*)&beta, c_, ldc); }); }); @@ -736,10 +736,10 @@ inline sycl::event syr2k(Func func, sycl::queue &queue, uplo upper_lower, transp } #define SYR2K_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -752,26 +752,26 @@ SYR2K_LAUNCHER_USM(std::complex, rocblas_zsyr2k) #undef SYR2K_LAUNCHER_USM template -inline sycl::event her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - int64_t n, int64_t k, const DataType alpha, const DataType *a, int64_t lda, - const DataType *b, int64_t ldb, const ScalarType beta, DataType *c, - int64_t ldc, const std::vector &dependencies) { +inline sycl::event her2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, const DataType alpha, const DataType* a, int64_t lda, + const DataType* b, int64_t ldb, const ScalarType beta, DataType* c, + int64_t ldc, const 
std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; using rocScalarType = typename RocEquivalentType::Type; overflow_check(n, k, lda, ldb, ldc); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_fill_mode(upper_lower), - get_rocblas_operation(trans), n, k, (rocDataType *)&alpha, a_, - lda, b_, ldb, (rocScalarType *)&beta, c_, ldc); + get_rocblas_operation(trans), n, k, (rocDataType*)&alpha, a_, + lda, b_, ldb, (rocScalarType*)&beta, c_, ldc); }); }); @@ -779,10 +779,10 @@ inline sycl::event her2k(Func func, sycl::queue &queue, uplo upper_lower, transp } #define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b, \ - int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const DATA_TYPE alpha, const DATA_TYPE* a, int64_t lda, const DATA_TYPE* b, \ + int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -797,26 +797,26 @@ HER2K_LAUNCHER_USM(std::complex, double, rocblas_zher2k) // separated 
from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. template -inline sycl::event trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a, - int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +inline sycl::event trmm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T* a, + int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); rocblas_status err; #if ROCBLAS_VERSION_MAJOR >= 4 ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), - m, n, (rocDataType *)&alpha, a_, lda, b_, ldb, b_, ldb); + m, n, (rocDataType*)&alpha, a_, lda, b_, ldb, b_, ldb); #else ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), @@ -830,9 +830,9 @@ inline sycl::event trmm(Func func, sycl::queue &queue, side left_right, uplo upp } #define TRMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event 
trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, b, ldb, dependencies); \ } @@ -845,25 +845,25 @@ TRMM_LAUNCHER_USM(std::complex, rocblas_ztrmm) #undef TRMM_LAUNCHER_USM template -inline sycl::event trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a, - int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +inline sycl::event trsm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T* a, + int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { using rocDataType = typename RocEquivalentType::Type; overflow_check(m, n, lda, ldb); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler &sc) { + onemkl_rocblas_host_task(cgh, queue, [=](RocblasScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); rocblas_status err; ROCBLAS_ERROR_FUNC_SYNC(func, err, handle, get_rocblas_side_mode(left_right), get_rocblas_fill_mode(upper_lower), get_rocblas_operation(trans), get_rocblas_diag_type(unit_diag), - m, n, (rocDataType *)&alpha, a_, lda, b_, ldb); + m, n, (rocDataType*)&alpha, a_, lda, b_, ldb); }); }); @@ -871,9 +871,9 @@ inline sycl::event trsm(Func func, sycl::queue &queue, side left_right, uplo upp } #define TRSM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsm(sycl::queue &queue, side left_right, uplo 
upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, b, ldb, dependencies); \ } @@ -892,9 +892,9 @@ namespace row_major { // Buffer APIs template -inline void gemm(Func func, sycl::queue &queue, transpose transa, transpose transb, int64_t m, - int64_t n, int64_t k, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void gemm(Func func, sycl::queue& queue, transpose transa, transpose transb, int64_t m, + int64_t n, int64_t k, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { auto new_transa = transb; auto new_transb = transa; @@ -903,9 +903,9 @@ inline void gemm(Func func, sycl::queue &queue, transpose transa, transpose tran } #define GEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE beta, sycl::buffer& c, \ int64_t ldc) { \ gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc); \ @@ -921,9 +921,9 @@ GEMM_LAUNCHER(std::complex, rocblas_zgemm) template inline void gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, COMPUTETYPE CT, - sycl::queue &queue, 
transpose transa, transpose transb, int64_t m, int64_t n, - int64_t k, T_S alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, T_S beta, sycl::buffer &c, + sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, + int64_t k, T_S alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, T_S beta, sycl::buffer& c, int64_t ldc) { auto new_transa = transb; auto new_transb = transa; @@ -934,9 +934,9 @@ inline void gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C #define GEMM_EX_LAUNCHER(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A, \ ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE) \ - void gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, TYPE_S beta, sycl::buffer &c, \ + void gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, TYPE_S beta, sycl::buffer& c, \ int64_t ldc) { \ gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE, \ queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); \ @@ -956,9 +956,9 @@ GEMM_EX_LAUNCHER(bfloat16, bfloat16, bfloat16, float, rocblas_gemm_ex, rocblas_d #undef GEMM_EX_LAUNCHER template -inline void symm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void symm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { auto new_side = left_right == oneapi::mkl::side::left ? 
oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper @@ -968,9 +968,9 @@ inline void symm(Func func, sycl::queue &queue, side left_right, uplo upper_lowe } #define SYMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \ c, ldc); \ } @@ -983,9 +983,9 @@ SYMM_LAUNCHER(std::complex, rocblas_zsymm) #undef SYMM_LAUNCHER template -inline void hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void hemm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -995,9 +995,9 @@ inline void hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lowe } #define HEMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, \ c, ldc); \ } @@ -1008,9 +1008,9 @@ HEMM_LAUNCHER(std::complex, rocblas_zhemm) #undef HEMM_LAUNCHER template -inline void syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, T beta, - sycl::buffer &c, int64_t ldc) { +inline void syrk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, T beta, + sycl::buffer& c, int64_t ldc) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -1020,9 +1020,9 @@ inline void syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define SYRK_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); \ } @@ -1034,9 +1034,9 @@ SYRK_LAUNCHER(std::complex, rocblas_zsyrk) #undef SYRK_LAUNCHER template -inline void herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, ScalarType alpha, sycl::buffer &a, int64_t lda, - ScalarType beta, sycl::buffer &c, int64_t ldc) { +inline void herk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, ScalarType alpha, sycl::buffer& a, int64_t lda, + ScalarType beta, sycl::buffer& c, int64_t ldc) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::conjtrans @@ -1046,9 +1046,9 @@ inline void herk(Func func, sycl::queue &queue, uplo upper_lower, transpose tran } #define HERK_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - void herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - SCALAR_TYPE alpha, sycl::buffer &a, int64_t lda, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + SCALAR_TYPE alpha, sycl::buffer& a, int64_t lda, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); \ } @@ -1058,9 +1058,9 @@ HERK_LAUNCHER(std::complex, double, rocblas_zherk) #undef HERK_LAUNCHER template -inline void syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, - int64_t ldb, T beta, sycl::buffer &c, int64_t ldc) { +inline void syr2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, + int64_t ldb, T beta, sycl::buffer& c, int64_t ldc) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -1071,9 +1071,9 @@ inline void syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose tra } #define SYR2K_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, sycl::buffer &a, int64_t lda, sycl::buffer &b, \ - int64_t ldb, TYPE beta, sycl::buffer &c, int64_t ldc) { \ + void syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, sycl::buffer& a, int64_t lda, sycl::buffer& b, \ + int64_t ldb, TYPE beta, sycl::buffer& c, int64_t ldc) { \ syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc); \ } @@ -1086,10 +1086,10 @@ SYR2K_LAUNCHER(std::complex, rocblas_zsyr2k) #undef SYR2K_LAUNCHER template -inline void her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, DataType alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb, ScalarType beta, - sycl::buffer &c, int64_t ldc) { +inline void her2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, DataType alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb, ScalarType beta, + sycl::buffer& c, int64_t ldc) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::conjtrans @@ -1101,10 +1101,10 @@ inline void her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose tra } #define HER2K_LAUNCHER(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - void her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - DATA_TYPE alpha, sycl::buffer &a, int64_t lda, \ - sycl::buffer &b, int64_t ldb, SCALAR_TYPE beta, \ - sycl::buffer &c, int64_t ldc) { \ + void her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + DATA_TYPE alpha, sycl::buffer& a, int64_t lda, \ + sycl::buffer& b, int64_t ldb, SCALAR_TYPE beta, \ + sycl::buffer& c, int64_t ldc) { \ her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, \ ldc); \ } @@ -1119,9 +1119,9 @@ HER2K_LAUNCHER(std::complex, double, rocblas_zher2k) // separated from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. template -inline void trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { +inline void trmm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -1132,9 +1132,9 @@ inline void trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lowe } #define TRMM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, sycl::buffer& b, int64_t ldb) { \ trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \ lda, b, ldb); \ } @@ -1147,9 +1147,9 @@ TRMM_LAUNCHER(std::complex, rocblas_ztrmm) #undef TRMM_LAUNCHER template -inline void trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, - diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer &a, int64_t lda, - sycl::buffer &b, int64_t ldb) { +inline void trsm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, + diag unit_diag, int64_t m, int64_t n, T alpha, sycl::buffer& a, int64_t lda, + sycl::buffer& b, int64_t ldb) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -1160,9 +1160,9 @@ inline void trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lowe } #define TRSM_LAUNCHER(TYPE, ROCBLAS_ROUTINE) \ - void trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer &a, \ - int64_t lda, sycl::buffer &b, int64_t ldb) { \ + void trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, sycl::buffer& a, \ + int64_t lda, sycl::buffer& b, int64_t ldb) { \ trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, \ lda, b, ldb); \ } @@ -1177,10 +1177,10 @@ TRSM_LAUNCHER(std::complex, rocblas_ztrsm) // USM APIs template -inline sycl::event gemm(Func func, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, T alpha, const T *a, int64_t lda, - const T *b, int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event gemm(Func func, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, int64_t k, T alpha, const T* a, int64_t lda, + const T* b, int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { auto new_transa = transb; auto new_transb = transa; @@ -1189,10 +1189,10 @@ inline sycl::event gemm(Func func, sycl::queue &queue, transpose transa, transpo } #define GEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, \ - int64_t ldb, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, \ + int64_t ldb, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& 
dependencies) { \ return gemm(ROCBLAS_ROUTINE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, \ c, ldc, dependencies); \ } @@ -1207,10 +1207,10 @@ GEMM_LAUNCHER_USM(std::complex, rocblas_zgemm) template inline sycl::event gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE_C DT_C, - COMPUTETYPE CT, sycl::queue &queue, transpose transa, transpose transb, - int64_t m, int64_t n, int64_t k, T_S alpha, const T_A *a, int64_t lda, - const T_B *b, int64_t ldb, T_S beta, T_C *c, int64_t ldc, - const std::vector &dependencies) { + COMPUTETYPE CT, sycl::queue& queue, transpose transa, transpose transb, + int64_t m, int64_t n, int64_t k, T_S alpha, const T_A* a, int64_t lda, + const T_B* b, int64_t ldb, T_S beta, T_C* c, int64_t ldc, + const std::vector& dependencies) { auto new_transa = transb; auto new_transb = transa; @@ -1220,10 +1220,10 @@ inline sycl::event gemm_ex(Func func, DATATYPE_A DT_A, DATATYPE_B DT_B, DATATYPE #define GEMM_EX_LAUNCHER_USM(TYPE_A, TYPE_B, TYPE_C, TYPE_S, ROCBLAS_ROUTINE, ROCMDATATYPE_A, \ ROCMDATATYPE_B, ROCMDATATYPE_C, ROCMCOMPUTETYPE) \ - sycl::event gemm(sycl::queue &queue, transpose transa, transpose transb, int64_t m, int64_t n, \ - int64_t k, TYPE_S alpha, const TYPE_A *a, int64_t lda, const TYPE_B *b, \ - int64_t ldb, TYPE_S beta, TYPE_C *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event gemm(sycl::queue& queue, transpose transa, transpose transb, int64_t m, int64_t n, \ + int64_t k, TYPE_S alpha, const TYPE_A* a, int64_t lda, const TYPE_B* b, \ + int64_t ldb, TYPE_S beta, TYPE_C* c, int64_t ldc, \ + const std::vector& dependencies) { \ return gemm_ex(ROCBLAS_ROUTINE, ROCMDATATYPE_A, ROCMDATATYPE_B, ROCMDATATYPE_C, \ ROCMCOMPUTETYPE, queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ @@ -1243,9 +1243,9 @@ GEMM_EX_LAUNCHER_USM(bfloat16, bfloat16, bfloat16, float, rocblas_gemm_ex, rocbl #undef GEMM_EX_LAUNCHER_USM template -inline sycl::event symm(Func func, 
sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb, - T beta, T *c, int64_t ldc, const std::vector &dependencies) { +inline sycl::event symm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, const T* a, int64_t lda, const T* b, int64_t ldb, + T beta, T* c, int64_t ldc, const std::vector& dependencies) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper @@ -1256,10 +1256,10 @@ inline sycl::event symm(Func func, sycl::queue &queue, side left_right, uplo upp } #define SYMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event symm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event symm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return symm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -1272,9 +1272,9 @@ SYMM_LAUNCHER_USM(std::complex, rocblas_zsymm) #undef SYMM_LAUNCHER_USM template -inline sycl::event hemm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, - int64_t n, T alpha, const T *a, int64_t lda, const T *b, int64_t ldb, - T beta, T *c, int64_t ldc, const std::vector &dependencies) { +inline sycl::event hemm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, + int64_t n, T alpha, const T* a, int64_t lda, const T* b, int64_t ldb, + T beta, T* c, int64_t ldc, const std::vector& dependencies) { auto new_side 
= left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper @@ -1285,10 +1285,10 @@ inline sycl::event hemm(Func func, sycl::queue &queue, side left_right, uplo upp } #define HEMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event hemm(sycl::queue &queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event hemm(sycl::queue& queue, side left_right, uplo upper_lower, int64_t m, int64_t n, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return hemm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -1299,9 +1299,9 @@ HEMM_LAUNCHER_USM(std::complex, rocblas_zhemm) #undef HEMM_LAUNCHER_USM template -inline sycl::event syrk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, T alpha, const T *a, int64_t lda, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event syrk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, T alpha, const T* a, int64_t lda, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -1312,9 +1312,9 @@ inline sycl::event syrk(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define SYRK_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syrk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event syrk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return syrk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \ dependencies); \ } @@ -1327,10 +1327,10 @@ SYRK_LAUNCHER_USM(std::complex, rocblas_zsyrk) #undef SYRK_LAUNCHER_USM template -inline sycl::event herk(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, - int64_t k, const ScalarType alpha, const DataType *a, int64_t lda, - const ScalarType beta, DataType *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event herk(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, + int64_t k, const ScalarType alpha, const DataType* a, int64_t lda, + const ScalarType beta, DataType* c, int64_t ldc, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::conjtrans @@ -1341,10 +1341,10 @@ inline sycl::event herk(Func func, sycl::queue &queue, uplo upper_lower, transpo } #define HERK_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - sycl::event herk(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - const SCALAR_TYPE alpha, const DATA_TYPE *a, int64_t lda, \ - const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event herk(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const SCALAR_TYPE alpha, const DATA_TYPE* a, int64_t lda, \ + const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return herk(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, \ dependencies); \ } @@ -1355,10 +1355,10 @@ HERK_LAUNCHER_USM(std::complex, double, rocblas_zherk) #undef HERK_LAUNCHER_USM template -inline sycl::event syr2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - int64_t n, int64_t k, T alpha, const T *a, int64_t lda, const T *b, - int64_t ldb, T beta, T *c, int64_t ldc, - const std::vector &dependencies) { +inline sycl::event syr2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, T alpha, const T* a, int64_t lda, const T* b, + int64_t ldb, T beta, T* c, int64_t ldc, + const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::trans @@ -1369,10 +1369,10 @@ inline sycl::event syr2k(Func func, sycl::queue &queue, uplo upper_lower, transp } #define SYR2K_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event syr2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - TYPE alpha, const TYPE *a, int64_t lda, const TYPE *b, int64_t ldb, \ - TYPE beta, TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event syr2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + TYPE alpha, const TYPE* a, int64_t lda, const TYPE* b, int64_t ldb, \ + TYPE beta, TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return syr2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -1385,10 +1385,10 @@ SYR2K_LAUNCHER_USM(std::complex, rocblas_zsyr2k) #undef SYR2K_LAUNCHER_USM template -inline sycl::event her2k(Func func, sycl::queue &queue, uplo upper_lower, transpose trans, - int64_t n, int64_t k, const DataType alpha, const DataType *a, int64_t lda, - const DataType *b, int64_t ldb, const ScalarType beta, DataType *c, - int64_t ldc, const std::vector &dependencies) { +inline sycl::event her2k(Func func, sycl::queue& queue, uplo upper_lower, transpose trans, + int64_t n, int64_t k, const DataType alpha, const DataType* a, int64_t lda, + const DataType* b, int64_t ldb, const ScalarType beta, DataType* c, + int64_t ldc, const std::vector& dependencies) { auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? oneapi::mkl::uplo::upper : oneapi::mkl::uplo::lower; auto new_trans = trans == oneapi::mkl::transpose::nontrans ? 
oneapi::mkl::transpose::conjtrans @@ -1400,10 +1400,10 @@ inline sycl::event her2k(Func func, sycl::queue &queue, uplo upper_lower, transp } #define HER2K_LAUNCHER_USM(DATA_TYPE, SCALAR_TYPE, ROCBLAS_ROUTINE) \ - sycl::event her2k(sycl::queue &queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ - const DATA_TYPE alpha, const DATA_TYPE *a, int64_t lda, const DATA_TYPE *b, \ - int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE *c, int64_t ldc, \ - const std::vector &dependencies) { \ + sycl::event her2k(sycl::queue& queue, uplo upper_lower, transpose trans, int64_t n, int64_t k, \ + const DATA_TYPE alpha, const DATA_TYPE* a, int64_t lda, const DATA_TYPE* b, \ + int64_t ldb, const SCALAR_TYPE beta, DATA_TYPE* c, int64_t ldc, \ + const std::vector& dependencies) { \ return her2k(ROCBLAS_ROUTINE, queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, \ beta, c, ldc, dependencies); \ } @@ -1418,10 +1418,10 @@ HER2K_LAUNCHER_USM(std::complex, double, rocblas_zher2k) // separated from the B matrix. It is possible to use B instead of C, but this // will slow-down the code. template -inline sycl::event trmm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a, - int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +inline sycl::event trmm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T* a, + int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -1432,9 +1432,9 @@ inline sycl::event trmm(Func func, sycl::queue &queue, side left_right, uplo upp } #define TRMM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trmm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event trmm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trmm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, b, ldb, dependencies); \ } @@ -1447,10 +1447,10 @@ TRMM_LAUNCHER_USM(std::complex, rocblas_ztrmm) #undef TRMM_LAUNCHER_USM template -inline sycl::event trsm(Func func, sycl::queue &queue, side left_right, uplo upper_lower, - transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T *a, - int64_t lda, T *b, int64_t ldb, - const std::vector &dependencies) { +inline sycl::event trsm(Func func, sycl::queue& queue, side left_right, uplo upper_lower, + transpose trans, diag unit_diag, int64_t m, int64_t n, T alpha, const T* a, + int64_t lda, T* b, int64_t ldb, + const std::vector& dependencies) { auto new_side = left_right == oneapi::mkl::side::left ? oneapi::mkl::side::right : oneapi::mkl::side::left; auto new_uplo = upper_lower == oneapi::mkl::uplo::lower ? 
oneapi::mkl::uplo::upper @@ -1461,9 +1461,9 @@ inline sycl::event trsm(Func func, sycl::queue &queue, side left_right, uplo upp } #define TRSM_LAUNCHER_USM(TYPE, ROCBLAS_ROUTINE) \ - sycl::event trsm(sycl::queue &queue, side left_right, uplo upper_lower, transpose trans, \ - diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE *a, int64_t lda, \ - TYPE *b, int64_t ldb, const std::vector &dependencies) { \ + sycl::event trsm(sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, \ + diag unit_diag, int64_t m, int64_t n, TYPE alpha, const TYPE* a, int64_t lda, \ + TYPE* b, int64_t ldb, const std::vector& dependencies) { \ return trsm(ROCBLAS_ROUTINE, queue, left_right, upper_lower, trans, unit_diag, m, n, \ alpha, a, lda, b, ldb, dependencies); \ } diff --git a/src/blas/backends/rocblas/rocblas_scope_handle.cpp b/src/blas/backends/rocblas/rocblas_scope_handle.cpp index 5edca96f7..d9c4d2f2e 100644 --- a/src/blas/backends/rocblas/rocblas_scope_handle.cpp +++ b/src/blas/backends/rocblas/rocblas_scope_handle.cpp @@ -26,7 +26,7 @@ namespace rocblas { template rocblas_handle_container::~rocblas_handle_container() noexcept(false) { - for (auto &handle_pair : rocblas_handle_container_mapper_) { + for (auto& handle_pair : rocblas_handle_container_mapper_) { rocblas_status err; if (handle_pair.second != nullptr) { auto handle = handle_pair.second->exchange(nullptr); @@ -59,7 +59,7 @@ thread_local rocblas_handle_container RocblasScopedContextHandler::h #endif RocblasScopedContextHandler::RocblasScopedContextHandler(sycl::queue queue, - sycl::interop_handle &ih) + sycl::interop_handle& ih) : interop_h(ih), needToRecover_(false) { placedContext_ = new sycl::context(queue.get_context()); @@ -89,8 +89,8 @@ RocblasScopedContextHandler::~RocblasScopedContextHandler() noexcept(false) { delete placedContext_; } -void ContextCallback(void *userData) { - auto *ptr = static_cast *>(userData); +void ContextCallback(void* userData) { + auto* ptr = 
static_cast*>(userData); if (!ptr) { return; } @@ -108,7 +108,7 @@ void ContextCallback(void *userData) { } } -rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue &queue) { +rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue& queue) { auto hipDevice = interop_h.get_native_device(); hipError_t hipErr; hipCtx_t desired; @@ -154,10 +154,10 @@ rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue &queue) return handle; } -hipStream_t RocblasScopedContextHandler::get_stream(const sycl::queue &queue) { +hipStream_t RocblasScopedContextHandler::get_stream(const sycl::queue& queue) { return sycl::get_native(queue); } -sycl::context RocblasScopedContextHandler::get_context(const sycl::queue &queue) { +sycl::context RocblasScopedContextHandler::get_context(const sycl::queue& queue) { return queue.get_context(); } diff --git a/src/blas/backends/rocblas/rocblas_scope_handle.hpp b/src/blas/backends/rocblas/rocblas_scope_handle.hpp index 734e58fb1..1e2af9873 100644 --- a/src/blas/backends/rocblas/rocblas_scope_handle.hpp +++ b/src/blas/backends/rocblas/rocblas_scope_handle.hpp @@ -45,29 +45,29 @@ namespace rocblas { template struct rocblas_handle_container { - using handle_container_t = std::unordered_map *>; + using handle_container_t = std::unordered_map*>; handle_container_t rocblas_handle_container_mapper_{}; ~rocblas_handle_container() noexcept(false); }; class RocblasScopedContextHandler { HIPcontext original_; - sycl::context *placedContext_; + sycl::context* placedContext_; bool needToRecover_; - sycl::interop_handle &interop_h; + sycl::interop_handle& interop_h; #ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED static thread_local rocblas_handle_container handle_helper; #else static thread_local rocblas_handle_container handle_helper; #endif - sycl::context get_context(const sycl::queue &queue); - hipStream_t get_stream(const sycl::queue &queue); + sycl::context get_context(const sycl::queue& queue); + hipStream_t 
get_stream(const sycl::queue& queue); public: - RocblasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih); + RocblasScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih); ~RocblasScopedContextHandler() noexcept(false); - rocblas_handle get_handle(const sycl::queue &queue); + rocblas_handle get_handle(const sycl::queue& queue); // This is a work-around function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. diff --git a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp b/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp index da9791411..64d883b52 100644 --- a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp +++ b/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.cpp @@ -27,7 +27,7 @@ namespace blas { namespace rocblas { rocblas_handle_container::~rocblas_handle_container() noexcept(false) { - for (auto &handle_pair : rocblas_handle_mapper_) { + for (auto& handle_pair : rocblas_handle_mapper_) { rocblas_status err; if (handle_pair.second != nullptr) { auto handle = handle_pair.second->exchange(nullptr); @@ -46,10 +46,10 @@ thread_local rocblas_handle_container RocblasScopedContextHandler::handle_helper rocblas_handle_container{}; RocblasScopedContextHandler::RocblasScopedContextHandler(sycl::queue queue, - sycl::interop_handle &ih) + sycl::interop_handle& ih) : interop_h(ih) {} -rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue &queue) { +rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue& queue) { sycl::device device = queue.get_device(); int current_device = interop_h.get_native_device(); hipStream_t streamId = get_stream(queue); @@ -84,7 +84,7 @@ rocblas_handle RocblasScopedContextHandler::get_handle(const sycl::queue &queue) return handle; } -hipStream_t RocblasScopedContextHandler::get_stream(const sycl::queue &queue) { +hipStream_t RocblasScopedContextHandler::get_stream(const sycl::queue& queue) 
{ return interop_h.get_native_queue(); } diff --git a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp b/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp index 3c156ab6c..07d0d8292 100644 --- a/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp +++ b/src/blas/backends/rocblas/rocblas_scope_handle_hipsycl.hpp @@ -35,7 +35,7 @@ namespace blas { namespace rocblas { struct rocblas_handle_container { - using handle_container_t = std::unordered_map *>; + using handle_container_t = std::unordered_map*>; handle_container_t rocblas_handle_mapper_{}; ~rocblas_handle_container() noexcept(false); }; @@ -43,13 +43,13 @@ struct rocblas_handle_container { class RocblasScopedContextHandler { sycl::interop_handle interop_h; static thread_local rocblas_handle_container handle_helper; - sycl::context get_context(const sycl::queue &queue); - hipStream_t get_stream(const sycl::queue &queue); + sycl::context get_context(const sycl::queue& queue); + hipStream_t get_stream(const sycl::queue& queue); public: - RocblasScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih); + RocblasScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih); - rocblas_handle get_handle(const sycl::queue &queue); + rocblas_handle get_handle(const sycl::queue& queue); // This is a work-around function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. 
diff --git a/src/blas/backends/rocblas/rocblas_task.hpp b/src/blas/backends/rocblas/rocblas_task.hpp index a52bd4c2e..51d9e3db7 100644 --- a/src/blas/backends/rocblas/rocblas_task.hpp +++ b/src/blas/backends/rocblas/rocblas_task.hpp @@ -53,7 +53,7 @@ namespace rocblas { #ifdef __HIPSYCL__ template -static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { +static inline void host_task_internal(H& cgh, sycl::queue queue, F f) { cgh.hipSYCL_enqueue_custom_operation([f, queue](sycl::interop_handle ih) { auto sc = RocblasScopedContextHandler(queue, ih); f(sc); @@ -61,7 +61,7 @@ static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { } #else template -static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { +static inline void host_task_internal(H& cgh, sycl::queue queue, F f) { cgh.host_task([f, queue](sycl::interop_handle ih) { auto sc = RocblasScopedContextHandler(queue, ih); f(sc); @@ -69,7 +69,7 @@ static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { } #endif template -static inline void onemkl_rocblas_host_task(H &cgh, sycl::queue queue, F f) { +static inline void onemkl_rocblas_host_task(H& cgh, sycl::queue queue, F f) { (void)host_task_internal(cgh, queue, f); } diff --git a/src/blas/blas_loader.cpp b/src/blas/blas_loader.cpp index c1f1339c6..3150199a4 100644 --- a/src/blas/blas_loader.cpp +++ b/src/blas/blas_loader.cpp @@ -32,3931 +32,3931 @@ static oneapi::mkl::detail::table_initializer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_scasum_sycl(queue, n, x, incx, result); } -void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, 
std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_dzasum_sycl(queue, n, x, incx, result); } -void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { function_tables[libkey].column_major_sasum_sycl(queue, n, x, incx, result); } -void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].column_major_dasum_sycl(queue, n, x, incx, result); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_saxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_daxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { 
function_tables[libkey].column_major_caxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zaxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_saxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_daxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, 
sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_caxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_zaxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_saxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_daxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + 
std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_caxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zaxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_scopy_sycl(queue, n, x, incx, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_dcopy_sycl(queue, n, x, incx, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_ccopy_sycl(queue, n, x, incx, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, 
std::int64_t incy) { +void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zcopy_sycl(queue, n, x, incx, y, incy); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_scopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_dcopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_ccopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, 
std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_zcopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { function_tables[libkey].column_major_sdot_sycl(queue, n, x, incx, y, incy, result); } -void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { function_tables[libkey].column_major_ddot_sycl(queue, n, x, incx, y, incy, result); } -void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { function_tables[libkey].column_major_dsdot_sycl(queue, n, x, incx, y, incy, result); } -void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { 
+void dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].column_major_cdotc_sycl(queue, n, x, incx, y, incy, result); } -void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].column_major_zdotc_sycl(queue, n, x, incx, y, incy, result); } -void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].column_major_cdotu_sycl(queue, n, x, incx, y, incy, result); } -void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].column_major_zdotu_sycl(queue, n, x, incx, y, incy, result); } -void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { 
function_tables[libkey].column_major_isamin_sycl(queue, n, x, incx, result); } -void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].column_major_idamin_sycl(queue, n, x, incx, result); } -void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_icamin_sycl(queue, n, x, incx, result); } -void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_izamin_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].column_major_isamax_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].column_major_idamax_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - 
sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_icamax_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_izamax_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_scnrm2_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].column_major_dznrm2_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { function_tables[libkey].column_major_snrm2_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { 
function_tables[libkey].column_major_dnrm2_sycl(queue, n, x, incx, result); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { function_tables[libkey].column_major_srot_sycl(queue, n, x, incx, y, incy, c, s); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, double s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { function_tables[libkey].column_major_drot_sycl(queue, n, x, incx, y, incy, c, s); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { function_tables[libkey].column_major_csrot_sycl(queue, n, x, incx, y, incy, c, s); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, double c, double s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { function_tables[libkey].column_major_zdrot_sycl(queue, n, x, incx, y, incy, c, s); } -void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, 
sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { function_tables[libkey].column_major_srotg_sycl(queue, a, b, c, s); } -void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { function_tables[libkey].column_major_drotg_sycl(queue, a, b, c, s); } -void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { function_tables[libkey].column_major_crotg_sycl(queue, a, b, c, s); } -void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { function_tables[libkey].column_major_zrotg_sycl(queue, a, b, c, s); } -void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { function_tables[libkey].column_major_srotm_sycl(queue, n, x, incx, y, incy, param); } -void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer ¶m) { +void rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& param) { 
function_tables[libkey].column_major_drotm_sycl(queue, n, x, incx, y, incy, param); } -void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer ¶m) { +void rotmg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { function_tables[libkey].column_major_srotmg_sycl(queue, d1, d2, x1, y1, param); } -void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer ¶m) { +void rotmg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { function_tables[libkey].column_major_drotmg_sycl(queue, d1, d2, x1, y1, param); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_sscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_dscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_cscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, 
std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_csscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_zscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_zdscal_sycl(queue, n, alpha, x, incx); } -void sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &result) { +void sdsdot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float sb, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { function_tables[libkey].column_major_sdsdot_sycl(queue, n, sb, x, incx, y, incy, result); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_sswap_sycl(queue, n, x, incx, y, incy); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, 
std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_dswap_sycl(queue, n, x, incx, y, incy); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_cswap_sycl(queue, n, x, incx, y, incy); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zswap_sycl(queue, n, x, incx, y, incy); } -void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_sgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + 
sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_dgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_cgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float 
beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_sgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_dgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_cgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zgemv_sycl(queue, 
trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, std::int64_t incy, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_sgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, std::int64_t incy, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_dgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &x, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t 
n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_cgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].column_major_zgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].column_major_sdgmm_batch_strided_sycl( queue, left_right, m, n, a, lda, 
stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].column_major_ddgmm_batch_strided_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, std::int64_t ldc, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].column_major_cdgmm_batch_strided_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, std::int64_t ldc, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, 
sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].column_major_zdgmm_batch_strided_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].column_major_sger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].column_major_dger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_cgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); 
} -void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_zgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_cgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_zgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { 
+void hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_chbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zhbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_chemv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hemv(oneapi::mkl::device libkey, sycl::queue& 
queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zhemv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +void her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_cher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +void her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_zher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_cher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, 
std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].column_major_zher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_chpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].column_major_zhpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +void hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { 
function_tables[libkey].column_major_chpr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +void hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { function_tables[libkey].column_major_zhpr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { +void hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { function_tables[libkey].column_major_chpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { +void hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { function_tables[libkey].column_major_zhpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, 
std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_ssbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_dsbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { +void spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_sspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy) { +void spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_dspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, 
sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { +void spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { function_tables[libkey].column_major_sspr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { +void spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { function_tables[libkey].column_major_dspr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { +void spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { function_tables[libkey].column_major_sspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { +void spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { function_tables[libkey].column_major_dspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { +void symv(oneapi::mkl::device 
libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_ssymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { +void symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].column_major_dsymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, +void syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].column_major_ssyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, +void syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].column_major_dsyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void 
syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].column_major_ssyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].column_major_dsyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_stbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_dtbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbmv(oneapi::mkl::device 
libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ctbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ztbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_stbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, 
std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_dtbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ctbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ztbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_stpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpmv(oneapi::mkl::device libkey, sycl::queue& 
queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_dtpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ctpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ztpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_stpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, 
std::int64_t incx) { function_tables[libkey].column_major_dtpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ctpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ztpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_strmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + 
sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_dtrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ctrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ztrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_strsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo 
upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].column_major_dtrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ctrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].column_major_ztrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_sgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void 
gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_dgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_cgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, 
beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_hgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_gemm_f16f16f32_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_gemm_bf16bf16f32_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(oneapi::mkl::device 
libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_chemm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zhemm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, std::int64_t ldc) { +void herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_cherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double 
alpha, sycl::buffer, 1> &a, - std::int64_t lda, double beta, sycl::buffer, 1> &c, +void herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_cher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void symm(oneapi::mkl::device libkey, 
sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_ssymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_dsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_csymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex 
beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - float beta, sycl::buffer &c, std::int64_t ldc) { +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_ssyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, std::int64_t ldc) { +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_dsyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_csyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, 
beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zsyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_ssyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &c, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_dsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void 
syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_csyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_zsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_ssyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, 
ldb, beta, c, ldc); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_dsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_csyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void 
trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].column_major_strmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].column_major_dtrmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].column_major_ctrmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + 
std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].column_major_ztrmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].column_major_strsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].column_major_dtrsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].column_major_ctrsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device 
libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].column_major_ztrsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_sgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { 
function_tables[libkey].column_major_dgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_cgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_zgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose 
transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_hgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_gemm_f16f16f32_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - 
float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_gemm_s8s8f32_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_gemm_s8s8s32_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].column_major_strsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, 
ldb, stride_b, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].column_major_dtrsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].column_major_ctrsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, 
std::int64_t batch_size) { function_tables[libkey].column_major_ztrsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, + transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_sgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_dgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + 
sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_cgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].column_major_gemm_s8u8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t 
ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].column_major_gemm_s8s8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].column_major_gemm_u8s8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].column_major_gemm_u8u8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void 
omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].column_major_somatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].column_major_domatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].column_major_comatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, 
transpose trans, std::int64_t m, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].column_major_zomatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].column_major_simatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].column_major_dimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, 
std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].column_major_cimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].column_major_zimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, float beta, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, float beta, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_somatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t 
lda, std::int64_t stride_a, double beta, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_domatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_comatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].column_major_zomatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void 
omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { +void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].column_major_somatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { +void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].column_major_domatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { +void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].column_major_comatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { +void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { 
function_tables[libkey].column_major_zomatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].column_major_somatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].column_major_domatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].column_major_comatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex 
alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].column_major_zomatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].column_major_simatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].column_major_dimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].column_major_cimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy(oneapi::mkl::device libkey, 
sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].column_major_zimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_somatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].column_major_domatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, 
std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_comatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].column_major_zomatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } // USM APIs -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_scasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_dzasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, const std::vector &dependencies) { +sycl::event 
asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { return function_tables[libkey].column_major_sasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { return function_tables[libkey].column_major_dasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_saxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_daxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, 
sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_caxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_zaxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - float *alpha, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + float* alpha, const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_saxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - double *alpha, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + double* alpha, const double** x, std::int64_t* incx, double** y, + std::int64_t* incy, std::int64_t group_count, 
std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_daxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_caxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_zaxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, std::int64_t stridex, float* y, 
std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_saxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_daxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_caxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t 
incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_zaxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_saxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_daxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return 
function_tables[libkey].column_major_caxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_zaxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_scopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_dcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, 
std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_ccopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_zcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const float **x, std::int64_t *incx, float **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_scopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_dcopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event 
copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_ccopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_zcopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_scopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue 
&queue, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_dcopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_ccopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_zcopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event dot(oneapi::mkl::device libkey, 
sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_sdot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_ddot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsdot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { +sycl::event dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return 
function_tables[libkey].column_major_cdotc_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { +sycl::event dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_zdotc_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { +sycl::event dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_cdotu_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { +sycl::event dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_zdotu_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { 
+sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_isamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_idamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_icamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_izamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, 
std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_isamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_idamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_icamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_izamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { return 
function_tables[libkey].column_major_scnrm2_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { return function_tables[libkey].column_major_dznrm2_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { return function_tables[libkey].column_major_snrm2_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { return function_tables[libkey].column_major_dnrm2_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, const std::vector &dependencies) { +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, float c, float s, const std::vector& dependencies) { return function_tables[libkey].column_major_srot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event 
rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_drot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { return function_tables[libkey].column_major_csrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { return function_tables[libkey].column_major_zdrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, float *a, float *b, float *c, - float *s, const std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, float* a, float* b, float* c, + float* s, const std::vector& dependencies) { return function_tables[libkey].column_major_srotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, double *a, double *b, double *c, - double *s, const 
std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { return function_tables[libkey].column_major_drotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { return function_tables[libkey].column_major_crotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& dependencies) { return function_tables[libkey].column_major_zrotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { return function_tables[libkey].column_major_srotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies); } -sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& 
dependencies) { return function_tables[libkey].column_major_drotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies); } -sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue& queue, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { return function_tables[libkey].column_major_srotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies); } -sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, double *d1, double *d2, - double *x1, double y1, double *param, - const std::vector &dependencies) { +sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue& queue, double* d1, double* d2, + double* x1, double y1, double* param, + const std::vector& dependencies) { return function_tables[libkey].column_major_drotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - float *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + float* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_sscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - double *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + double* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_dscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const 
std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_cscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_csscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_zscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_zdscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *result, const std::vector &dependencies) { +sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float sb, + const float* x, std::int64_t 
incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { return function_tables[libkey].column_major_sdsdot_usm_sycl(queue, n, sb, x, incx, y, incy, result, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_sswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_dswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_cswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& 
dependencies) { return function_tables[libkey].column_major_zswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_sgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_dgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex 
*x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_cgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_zgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_sgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t 
m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_dgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_cgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& 
dependencies) { return function_tables[libkey].column_major_zgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float beta, float *y, std::int64_t incy, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_sgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double beta, double *y, std::int64_t incy, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_dgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event 
gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_cgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_zgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, 
const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].column_major_sgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].column_major_dgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event 
gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].column_major_cgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_zgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const float* a, 
std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_sdgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_ddgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return 
function_tables[libkey].column_major_cdgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_zdgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_sdgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, std::int64_t 
*lda, - const double **x, std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_ddgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].column_major_cdgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, 
+ std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].column_major_zdgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, std::int64_t lda, const std::vector &dependencies) { +sycl::event ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_sger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].column_major_dger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, 
const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_cgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_zgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_cgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, 
std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_zgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_chbmv_usm_sycl( queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_zhbmv_usm_sycl( queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const 
std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_chemv_usm_sycl( queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_zhemv_usm_sycl( queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].column_major_cher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event her(oneapi::mkl::device libkey, sycl::queue 
&queue, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].column_major_zher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_cher2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_zher2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, 
std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_chpmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_zhpmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { +sycl::event hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { return function_tables[libkey].column_major_chpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, 
std::int64_t incx, - std::complex *a, const std::vector &dependencies) { +sycl::event hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { return function_tables[libkey].column_major_zhpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { return function_tables[libkey].column_major_chpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { return function_tables[libkey].column_major_zhpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event 
sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_ssbmv_usm_sycl( queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsbmv_usm_sycl( queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *a, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_sspmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, const std::vector &dependencies) { 
+sycl::event spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].column_major_dspmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - const std::vector &dependencies) { +sycl::event spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies) { return function_tables[libkey].column_major_sspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies) { +sycl::event spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* a, + const std::vector& dependencies) { return function_tables[libkey].column_major_dspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, const std::vector &dependencies) { +sycl::event spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { return function_tables[libkey].column_major_sspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } 
-sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, const std::vector &dependencies) { +sycl::event spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { return function_tables[libkey].column_major_dspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_ssymv_usm_sycl( queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsymv_usm_sycl( queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event syr(oneapi::mkl::device libkey, sycl::queue 
&queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].column_major_ssyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, std::int64_t lda, const std::vector &dependencies) { +sycl::event syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].column_major_ssyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr2(oneapi::mkl::device libkey, sycl::queue& 
queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_stbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_dtbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, 
std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_stbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, 
const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_dtbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& 
dependencies) { return function_tables[libkey].column_major_stpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_dtpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose 
trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_stpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_dtpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, 
const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_strmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_dtrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return 
function_tables[libkey].column_major_ctrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_strsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].column_major_dtrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event 
trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_sgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, 
beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_dgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_cgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, 
const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_zgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_hgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_f16f16f32_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, 
transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_bf16bf16f32_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_chemm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex 
beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_zhemm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, + std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_cherk_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_zherk_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, std::int64_t 
ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_cher2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_zher2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_ssymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, 
const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_csymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return 
function_tables[libkey].column_major_zsymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_ssyrk_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsyrk_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) 
{ return function_tables[libkey].column_major_csyrk_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_zsyrk_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_ssyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event 
syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_csyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, 
std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_zsyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float beta, float *c, +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_ssyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, 
+sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, std::int64_t ldc, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_csyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, std::int64_t ldc, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_zsyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const 
float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_ssyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_dsyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_csyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, 
std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_zsyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_strmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_dtrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo 
upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_strsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& 
queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_dtrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event 
trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_strsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_dtrsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, + std::int64_t n, 
std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_ctrsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_ztrsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return 
function_tables[libkey].column_major_strsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].column_major_dtrsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_ctrsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, 
group_count, group_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_ztrsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_sgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, 
sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_dgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **b, std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].column_major_cgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, 
sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_zgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - sycl::half *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_hgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event 
gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_f16f16f32_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_s8s8f32_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event 
gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_s8s8s32_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_sgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device 
libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_dgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_cgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, 
+sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_zgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_hgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& 
queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_f16f16f32_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_s8s8f32_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, 
transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_s8s8s32_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, - float *c, std::int64_t ldc, const std::vector &dependencies) { + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, float beta, + float* c, std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_sgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, - double *c, std::int64_t ldc, const std::vector &dependencies) { + const double* a, std::int64_t lda, const 
double* b, std::int64_t ldb, double beta, + double* c, std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_dgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_cgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].column_major_zgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, 
sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_s8u8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_s8s8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float 
alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_u8s8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].column_major_gemm_u8u8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb, +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + 
std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_somatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_domatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_comatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, 
+sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_zomatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_simatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_dimatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event 
imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_cimatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_zimatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_somatadd_batch_strided_usm_sycl( queue, transa, 
transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].column_major_domatadd_batch_strided_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_comatadd_batch_strided_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event 
omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].column_major_zomatadd_batch_strided_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - float *b, std::int64_t ldb, const std::vector &dependencies) { +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].column_major_somatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& 
dependencies) { return function_tables[libkey].column_major_domatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].column_major_comatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].column_major_zomatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { return function_tables[libkey].column_major_somatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } 
-sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, double *b, std::int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, double* b, std::int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { return function_tables[libkey].column_major_domatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - std::complex *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stridea, + std::complex* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { return function_tables[libkey].column_major_comatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - std::complex *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stridea, + std::complex* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { return 
function_tables[libkey].column_major_zomatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, - std::int64_t ldb, const std::vector &dependencies) { +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].column_major_simatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, std::int64_t lda, - std::int64_t ldb, const std::vector &dependencies) { +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].column_major_dimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_cimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, 
std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].column_major_zimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float beta, const float *b, std::int64_t ldb, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float beta, const float* b, std::int64_t ldb, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_somatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double beta, const double *b, std::int64_t ldb, double *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double beta, const double* b, std::int64_t ldb, double* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_domatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, 
transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_comatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].column_major_zomatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].column_major_somatcopy_batch_group_usm_sycl( 
queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].column_major_domatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return function_tables[libkey].column_major_comatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event 
omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return function_tables[libkey].column_major_zomatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, float **ab, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, float** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return function_tables[libkey].column_major_simatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, double **ab, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, double** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return function_tables[libkey].column_major_dimatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue 
&queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].column_major_cimatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].column_major_zimatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } @@ -3970,3923 +3970,3923 @@ static oneapi::mkl::detail::table_initializer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].row_major_scasum_sycl(queue, n, x, incx, result); } -void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, 
std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].row_major_dzasum_sycl(queue, n, x, incx, result); } -void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_sasum_sycl(queue, n, x, incx, result); } -void asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_dasum_sycl(queue, n, x, incx, result); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_saxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_daxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) 
{ function_tables[libkey].row_major_caxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zaxpy_sycl(queue, n, alpha, x, incx, y, incy); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_saxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_daxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, 
sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_caxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, std::int64_t stridex, sycl::buffer, 1> &y, +void axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, std::int64_t stridex, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_zaxpy_batch_strided_sycl(queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_saxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_daxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, 
sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_caxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zaxpby_sycl(queue, n, alpha, x, incx, beta, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_scopy_sycl(queue, n, x, incx, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_dcopy_sycl(queue, n, x, incx, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_ccopy_sycl(queue, n, x, incx, y, incy); } -void copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void 
copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zcopy_sycl(queue, n, x, incx, y, incy); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_scopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_dcopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_ccopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, 
std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, +void copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_zcopy_batch_strided_sycl(queue, n, x, incx, stridex, y, incy, stridey, batch_size); } -void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { function_tables[libkey].row_major_sdot_sycl(queue, n, x, incx, y, incy, result); } -void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { function_tables[libkey].row_major_ddot_sycl(queue, n, x, incx, y, incy, result); } -void dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result) { +void dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result) { function_tables[libkey].row_major_dsdot_sycl(queue, n, x, incx, y, incy, result); } -void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotc(oneapi::mkl::device libkey, sycl::queue& 
queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].row_major_cdotc_sycl(queue, n, x, incx, y, incy, result); } -void dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].row_major_zdotc_sycl(queue, n, x, incx, y, incy, result); } -void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].row_major_cdotu_sycl(queue, n, x, incx, y, incy, result); } -void dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result) { +void dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result) { function_tables[libkey].row_major_zdotu_sycl(queue, n, x, incx, y, incy, result); } -void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_isamin_sycl(queue, n, x, incx, result); } -void iamin(oneapi::mkl::device libkey, 
sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_idamin_sycl(queue, n, x, incx, result); } -void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].row_major_icamin_sycl(queue, n, x, incx, result); } -void iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].row_major_izamin_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_isamax_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_idamax_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t 
incx, + sycl::buffer& result) { function_tables[libkey].row_major_icamax_sycl(queue, n, x, incx, result); } -void iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].row_major_izamax_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].row_major_scnrm2_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result) { function_tables[libkey].row_major_dznrm2_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_snrm2_sycl(queue, n, x, incx, result); } -void nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &result) { +void nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& result) { function_tables[libkey].row_major_dnrm2_sycl(queue, n, x, incx, result); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, 
std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, float s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s) { function_tables[libkey].row_major_srot_sycl(queue, n, x, incx, y, incy, c, s); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, double c, double s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s) { function_tables[libkey].row_major_drot_sycl(queue, n, x, incx, y, incy, c, s); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, float c, float s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s) { function_tables[libkey].row_major_csrot_sycl(queue, n, x, incx, y, incy, c, s); } -void rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, double c, double s) { +void rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s) { function_tables[libkey].row_major_zdrot_sycl(queue, n, x, incx, y, incy, c, s); } -void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { function_tables[libkey].row_major_srotg_sycl(queue, a, b, c, s); } -void rotg(oneapi::mkl::device libkey, 
sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, sycl::buffer &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, sycl::buffer& s) { function_tables[libkey].row_major_drotg_sycl(queue, a, b, c, s); } -void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { function_tables[libkey].row_major_crotg_sycl(queue, a, b, c, s); } -void rotg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s) { +void rotg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s) { function_tables[libkey].row_major_zrotg_sycl(queue, a, b, c, s); } -void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m) { +void rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param) { function_tables[libkey].row_major_srotm_sycl(queue, n, x, incx, y, incy, param); } -void rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer ¶m) { +void rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& param) { function_tables[libkey].row_major_drotm_sycl(queue, n, x, incx, y, incy, param); } -void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer 
¶m) { +void rotmg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param) { function_tables[libkey].row_major_srotmg_sycl(queue, d1, d2, x1, y1, param); } -void rotmg(oneapi::mkl::device libkey, sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, double y1, - sycl::buffer ¶m) { +void rotmg(oneapi::mkl::device libkey, sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, double y1, + sycl::buffer& param) { function_tables[libkey].row_major_drotmg_sycl(queue, d1, d2, x1, y1, param); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_sscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_dscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_cscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_csscal_sycl(queue, n, alpha, x, incx); } -void 
scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_zscal_sycl(queue, n, alpha, x, incx); } -void scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx) { +void scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_zdscal_sycl(queue, n, alpha, x, incx); } -void sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &result) { +void sdsdot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float sb, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& result) { function_tables[libkey].row_major_sdsdot_sycl(queue, n, sb, x, incx, y, incy, result); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy) { +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_sswap_sycl(queue, n, x, incx, y, incy); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, sycl::buffer &y, +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_dswap_sycl(queue, n, x, incx, y, incy); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - 
sycl::buffer, 1> &y, std::int64_t incy) { +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_cswap_sycl(queue, n, x, incx, y, incy); } -void swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy) { +void swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zswap_sycl(queue, n, x, incx, y, incy); } -void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy) { +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_sgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_dgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void 
gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_cgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zgbmv_sycl(queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_sgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - 
std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_dgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_cgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zgemv_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer 
&x, std::int64_t incx, - std::int64_t stridex, float beta, sycl::buffer &y, std::int64_t incy, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_sgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, double beta, sycl::buffer &y, std::int64_t incy, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, double beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_dgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &x, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy, std::int64_t stridey, + sycl::buffer, 1>& y, 
std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_cgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size) { function_tables[libkey].row_major_zgemv_batch_strided_sycl(queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].row_major_sdgmm_batch_strided_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer &x, std::int64_t incx, 
std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, std::int64_t stridec, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].row_major_ddgmm_batch_strided_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, std::int64_t ldc, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].row_major_cdgmm_batch_strided_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, std::int64_t m, - std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer, 1> &c, std::int64_t ldc, +void dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, std::int64_t m, + std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size) { function_tables[libkey].row_major_zdgmm_batch_strided_sycl( queue, 
left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size); } -void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].row_major_sger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].row_major_dger_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_cgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void gerc(oneapi::mkl::device libkey, sycl::queue& 
queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_zgerc_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_cgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_zgeru_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda); } -void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { 
function_tables[libkey].row_major_chbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, std::int64_t incy) { +void hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zhbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_chemv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zhemv_sycl(queue, upper_lower, n, alpha, a, 
lda, x, incx, beta, y, incy); } -void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +void her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_cher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda) { +void her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_zher_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_cher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda) { +void her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t 
incy, + sycl::buffer, 1>& a, std::int64_t lda) { function_tables[libkey].row_major_zher2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_chpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy) { +void hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, + sycl::buffer, 1>& y, std::int64_t incy) { function_tables[libkey].row_major_zhpmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +void hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { function_tables[libkey].row_major_chpr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a) { +void hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo 
upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a) { function_tables[libkey].row_major_zhpr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { +void hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { function_tables[libkey].row_major_chpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a) { +void hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a) { function_tables[libkey].row_major_zhpr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, sycl::buffer &y, +void sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_ssbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, 
sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, sycl::buffer &y, +void sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_dsbmv_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); } -void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy) { +void spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_sspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy) { +void spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_dspmv_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy); } -void spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { +void spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { function_tables[libkey].row_major_sspr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void spr(oneapi::mkl::device libkey, sycl::queue 
&queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a) { +void spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a) { function_tables[libkey].row_major_dspr_sycl(queue, upper_lower, n, alpha, x, incx, a); } -void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { +void spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { function_tables[libkey].row_major_sspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a) { +void spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a) { function_tables[libkey].row_major_dspr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a); } -void symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, std::int64_t incy) { +void symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_ssymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void symv(oneapi::mkl::device libkey, sycl::queue &queue, 
uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, sycl::buffer &x, - std::int64_t incx, double beta, sycl::buffer &y, std::int64_t incy) { +void symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, sycl::buffer& x, + std::int64_t incx, double beta, sycl::buffer& y, std::int64_t incy) { function_tables[libkey].row_major_dsymv_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); } -void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, +void syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].row_major_ssyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &a, +void syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].row_major_dsyr_sycl(queue, upper_lower, n, alpha, x, incx, a, lda); } -void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].row_major_ssyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo 
upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, std::int64_t lda) { +void syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda) { function_tables[libkey].row_major_dsyr2_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda); } -void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_stbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_dtbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 
1>& x, std::int64_t incx) { function_tables[libkey].row_major_ctbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ztbmv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_stbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_dtbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, 
std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ctbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ztbsv_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_stpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_dtpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t 
incx) { +void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ctpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ztpmv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_stpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, sycl::buffer &x, +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_dtpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t 
n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ctpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx) { +void tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ztpsv_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_strmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_dtrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag 
unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ctrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ztrmv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_strsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx) { +void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx) { function_tables[libkey].row_major_dtrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx) { +void 
trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ctrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, std::int64_t incx) { +void trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx) { function_tables[libkey].row_major_ztrsv_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_sgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, 
sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_dgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_cgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::half beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::half 
beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_hgemm_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_gemm_f16f16f32_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +void gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_gemm_bf16bf16f32_sycl(queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) 
{ function_tables[libkey].row_major_chemm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zhemm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1> &a, - std::int64_t lda, float beta, sycl::buffer, 1> &c, std::int64_t ldc) { +void herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer, 1>& a, + std::int64_t lda, float beta, sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_cherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1> &a, - std::int64_t lda, double beta, sycl::buffer, 1> &c, +void herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer, 1>& a, + std::int64_t lda, double beta, sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zherk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, 
transpose trans, +void her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, float beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, float beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_cher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, double beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, double beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zher2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_ssymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, 
sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_dsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_csymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zsymm_sycl(queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - float beta, sycl::buffer &c, std::int64_t 
ldc) { +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_ssyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &c, std::int64_t ldc) { +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_dsyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_csyrk_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc); } -void syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zsyrk_sycl(queue, upper_lower, trans, n, k, alpha, 
a, lda, beta, c, ldc); } -void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer &c, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_ssyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer &c, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_dsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { 
function_tables[libkey].row_major_csyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_zsyrk_batch_strided_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, sycl::buffer &c, +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_ssyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc) { +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t 
ldc) { function_tables[libkey].row_major_dsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_csyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +void syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zsyr2k_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].row_major_strmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(oneapi::mkl::device libkey, sycl::queue 
&queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].row_major_dtrmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].row_major_ctrmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].row_major_ztrmm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t 
lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].row_major_strsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].row_major_dtrsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].row_major_ctrsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb) { + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].row_major_ztrsm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void 
gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_sgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_dgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - 
std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_cgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_zgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::half beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::half beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { 
function_tables[libkey].row_major_hgemm_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_gemm_f16f16f32_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_gemm_s8s8f32_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void 
gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - float beta, sycl::buffer &c, std::int64_t ldc, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_gemm_s8s8s32_batch_strided_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].row_major_strsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { 
function_tables[libkey].row_major_dtrsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].row_major_ctrsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +void trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].row_major_ztrsm_batch_strided_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, - transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc) { +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, + 
transpose transb, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_sgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, double beta, sycl::buffer &c, std::int64_t ldc) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, double beta, sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_dgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_cgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose transa, +void gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, std::complex beta, - 
sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_zgemmt_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].row_major_gemm_s8u8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].row_major_gemm_s8s8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, 
transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, int8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, int8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].row_major_gemm_u8s8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, - sycl::buffer &b, std::int64_t ldb, uint8_t bo, float beta, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &co) { + sycl::buffer& a, std::int64_t lda, uint8_t ao, + sycl::buffer& b, std::int64_t ldb, uint8_t bo, float beta, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& co) { function_tables[libkey].row_major_gemm_u8u8s32_bias_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co); } -void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].row_major_somatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, 
batch_size); } -void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].row_major_domatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].row_major_comatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { function_tables[libkey].row_major_zomatcopy_batch_strided_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, 
std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].row_major_simatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].row_major_dimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].row_major_cimatcopy_batch_strided_sycl(queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size); } -void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +void imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, std::int64_t ldb, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size) { function_tables[libkey].row_major_zimatcopy_batch_strided_sycl(queue, trans, 
m, n, alpha, ab, lda, ldb, stride, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, float beta, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, float beta, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_somatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, double beta, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_domatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, 
- std::complex beta, sycl::buffer, 1> &b, - std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, + std::int64_t ldb, std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_comatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +void omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, + sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size) { function_tables[libkey].row_major_zomatadd_batch_strided_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size); } -void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { +void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].row_major_somatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb) { 
+void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb) { function_tables[libkey].row_major_domatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { +void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].row_major_comatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, std::int64_t ldb) { +void omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb) { function_tables[libkey].row_major_zomatcopy_sycl(queue, trans, m, n, alpha, a, lda, b, ldb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].row_major_somatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, 
transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, std::int64_t ldb, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].row_major_domatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].row_major_comatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, std::int64_t stridea, sycl::buffer, 1> &b, +void omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb) { function_tables[libkey].row_major_zomatcopy2_sycl(queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, 
sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].row_major_simatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &ab, std::int64_t lda, +void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].row_major_dimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].row_major_cimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, sycl::buffer, 1> &ab, +void imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb) { function_tables[libkey].row_major_zimatcopy_sycl(queue, trans, m, n, alpha, ab, lda, ldb); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, 
std::int64_t ldc) { function_tables[libkey].row_major_somatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc) { +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc) { function_tables[libkey].row_major_domatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { function_tables[libkey].row_major_comatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } -void omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +void omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc) { + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc) { 
function_tables[libkey].row_major_zomatadd_sycl(queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc); } // USM APIs -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_scasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_dzasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, const std::vector &dependencies) { +sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { return function_tables[libkey].row_major_sasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event asum(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event asum(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { return function_tables[libkey].row_major_dasum_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, 
std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_saxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_daxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_caxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return 
function_tables[libkey].row_major_zaxpy_usm_sycl(queue, n, alpha, x, incx, y, incy, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - float *alpha, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + float* alpha, const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_saxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - double *alpha, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + double* alpha, const double** x, std::int64_t* incx, double** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_daxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const 
std::vector& dependencies) { return function_tables[libkey].row_major_caxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - std::complex *alpha, const std::complex **x, - std::int64_t *incx, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + std::complex* alpha, const std::complex** x, + std::int64_t* incx, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_zaxpy_batch_group_usm_sycl( queue, n, alpha, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_saxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& 
dependencies) { return function_tables[libkey].row_major_daxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_caxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, +sycl::event axpy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_zaxpy_batch_strided_usm_sycl( queue, n, alpha, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { 
return function_tables[libkey].row_major_saxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_daxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_caxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event axpby(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event axpby(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_zaxpby_usm_sycl(queue, n, alpha, x, incx, beta, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - 
std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_scopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_dcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_ccopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event copy(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_zcopy_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const float **x, std::int64_t *incx, float **y, std::int64_t 
*incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const float** x, std::int64_t* incx, float** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_scopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const double **x, std::int64_t *incx, double **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const double** x, std::int64_t* incx, double** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_dcopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_ccopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, 
std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_zcopy_batch_group_usm_sycl( queue, n, x, incx, y, incy, group_count, group_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const float *x, std::int64_t incx, std::int64_t stridex, float *y, +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const float* x, std::int64_t incx, std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_scopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const double *x, std::int64_t incx, std::int64_t stridex, double *y, +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const double* x, std::int64_t incx, std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_dcopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, 
sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_ccopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event copy_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_zcopy_batch_strided_usm_sycl( queue, n, x, incx, stridex, y, incy, stridey, batch_size, dependencies); } -sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, float *result, - const std::vector &dependencies) { +sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, float* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_sdot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, double* result, + const std::vector& dependencies) 
{ return function_tables[libkey].row_major_ddot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, double *result, - const std::vector &dependencies) { +sycl::event dot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, double* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsdot_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { +sycl::event dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_cdotc_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { +sycl::event dotc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_zdotc_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { 
+sycl::event dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_cdotu_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event dotu(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *result, - const std::vector &dependencies) { +sycl::event dotu(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_zdotu_usm_sycl(queue, n, x, incx, y, incy, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_isamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_idamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector 
&dependencies) { +sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_icamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamin(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamin(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_izamin_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_isamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_idamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, 
std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_icamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event iamax(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, std::int64_t *result, - const std::vector &dependencies) { +sycl::event iamax(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, std::int64_t* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_izamax_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, float *result, - const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, float* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_scnrm2_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, double *result, - const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, double* result, + const std::vector& dependencies) { return function_tables[libkey].row_major_dznrm2_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, const std::vector& dependencies) { return function_tables[libkey].row_major_snrm2_usm_sycl(queue, n, x, incx, result, 
dependencies); } -sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, const std::vector &dependencies) { +sycl::event nrm2(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, const std::vector& dependencies) { return function_tables[libkey].row_major_dnrm2_usm_sycl(queue, n, x, incx, result, dependencies); } -sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, float c, float s, const std::vector &dependencies) { +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, float c, float s, const std::vector& dependencies) { return function_tables[libkey].row_major_srot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_drot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, float s, - const std::vector &dependencies) { +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, + const std::vector& dependencies) { return function_tables[libkey].row_major_csrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event 
rot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double c, double s, - const std::vector &dependencies) { +sycl::event rot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, + const std::vector& dependencies) { return function_tables[libkey].row_major_zdrot_usm_sycl(queue, n, x, incx, y, incy, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, float *a, float *b, float *c, - float *s, const std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, float* a, float* b, float* c, + float* s, const std::vector& dependencies) { return function_tables[libkey].row_major_srotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, double *a, double *b, double *c, - double *s, const std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, double* a, double* b, double* c, + double* s, const std::vector& dependencies) { return function_tables[libkey].row_major_drotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex *a, - std::complex *b, float *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, std::complex* a, + std::complex* b, float* c, std::complex* s, + const std::vector& dependencies) { return function_tables[libkey].row_major_crotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotg(oneapi::mkl::device libkey, sycl::queue &queue, std::complex *a, - std::complex *b, double *c, std::complex *s, - const std::vector &dependencies) { +sycl::event rotg(oneapi::mkl::device libkey, sycl::queue& queue, std::complex* a, + std::complex* b, double* c, std::complex* s, + const std::vector& 
dependencies) { return function_tables[libkey].row_major_zrotg_usm_sycl(queue, a, b, c, s, dependencies); } -sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float *param, - const std::vector &dependencies) { +sycl::event rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float* param, + const std::vector& dependencies) { return function_tables[libkey].row_major_srotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies); } -sycl::event rotm(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, double *param, - const std::vector &dependencies) { +sycl::event rotm(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double* param, + const std::vector& dependencies) { return function_tables[libkey].row_major_drotm_usm_sycl(queue, n, x, incx, y, incy, param, dependencies); } -sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, float *d1, float *d2, float *x1, - float y1, float *param, const std::vector &dependencies) { +sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue& queue, float* d1, float* d2, float* x1, + float y1, float* param, const std::vector& dependencies) { return function_tables[libkey].row_major_srotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies); } -sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue &queue, double *d1, double *d2, - double *x1, double y1, double *param, - const std::vector &dependencies) { +sycl::event rotmg(oneapi::mkl::device libkey, sycl::queue& queue, double* d1, double* d2, + double* x1, double y1, double* param, + const std::vector& dependencies) { return function_tables[libkey].row_major_drotmg_usm_sycl(queue, d1, d2, x1, y1, param, dependencies); } -sycl::event 
scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - float *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float alpha, + float* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_sscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - double *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + double* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_dscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_cscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_csscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float 
alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_zscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event scal(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event scal(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_zdscal_usm_sycl(queue, n, alpha, x, incx, dependencies); } -sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *result, const std::vector &dependencies) { +sycl::event sdsdot(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float sb, + const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* result, const std::vector& dependencies) { return function_tables[libkey].row_major_sdsdot_usm_sycl(queue, n, sb, x, incx, y, incy, result, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_sswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies) { 
return function_tables[libkey].row_major_dswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_cswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event swap(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event swap(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_zswap_usm_sycl(queue, n, x, incx, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, float beta, float *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, float beta, float* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_sgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::int64_t kl, std::int64_t ku, double 
alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, double beta, double *y, - std::int64_t incy, const std::vector &dependencies) { +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, double beta, double* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_dgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_cgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, +sycl::event gbmv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *x, - std::int64_t incx, std::complex beta, std::complex *y, - std::int64_t incy, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* x, + std::int64_t incx, std::complex beta, std::complex* y, + std::int64_t incy, const std::vector& 
dependencies) { return function_tables[libkey].row_major_zgbmv_usm_sycl( queue, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_sgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_dgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, 
const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_cgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event gemv(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_zgemv_usm_sycl(queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float beta, float *y, std::int64_t incy, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float beta, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_sgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event 
gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double beta, double *y, std::int64_t incy, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double beta, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_dgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_cgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, 
std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_zgemv_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, x, incx, stridex, beta, y, incy, stridey, batch_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, const float **x, std::int64_t *incx, float *beta, - float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, const float** x, std::int64_t* incx, float* beta, + float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].row_major_sgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, const double **x, std::int64_t *incx, double *beta, - double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(oneapi::mkl::device libkey, 
sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, const double** x, std::int64_t* incx, double* beta, + double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].row_major_dgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].row_major_cgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, - std::complex *beta, std::complex **y, std::int64_t *incy, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemv_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + 
const std::complex** x, std::int64_t* incx, + std::complex* beta, std::complex** y, std::int64_t* incy, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_zgemv_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, x, incx, beta, y, incy, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const float *a, std::int64_t lda, - std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const float* a, std::int64_t lda, + std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_sdgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const double *a, std::int64_t lda, - std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const double* a, std::int64_t lda, + std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies) { return 
function_tables[libkey].row_major_ddgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_cdgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, - std::int64_t m, std::int64_t n, const std::complex *a, - std::int64_t lda, std::int64_t stridea, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *c, +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, + std::int64_t m, std::int64_t n, const std::complex* a, + std::int64_t lda, std::int64_t stridea, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_zdgmm_batch_strided_usm_sycl( queue, left_right, m, n, a, lda, stridea, x, incx, stridex, c, ldc, stridec, batch_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, 
std::int64_t *n, const float **a, std::int64_t *lda, - const float **x, std::int64_t *incx, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const float** a, std::int64_t* lda, + const float** x, std::int64_t* incx, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_sdgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, std::int64_t *n, const double **a, std::int64_t *lda, - const double **x, std::int64_t *incx, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const double** a, std::int64_t* lda, + const double** x, std::int64_t* incx, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_ddgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, 
+ std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].row_major_cdgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - std::int64_t *m, std::int64_t *n, const std::complex **a, - std::int64_t *lda, const std::complex **x, std::int64_t *incx, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event dgmm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + std::int64_t* m, std::int64_t* n, const std::complex** a, + std::int64_t* lda, const std::complex** x, std::int64_t* incx, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].row_major_zdgmm_batch_group_usm_sycl( queue, left_right, m, n, a, lda, x, incx, c, ldc, group_count, group_size, dependencies); } -sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, std::int64_t lda, const std::vector &dependencies) { +sycl::event ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_sger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event ger(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, 
double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event ger(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].row_major_dger_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_cgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event gerc(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event gerc(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_zgerc_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, 
std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_cgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event geru(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event geru(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_zgeru_usm_sycl(queue, m, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_chbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, 
std::int64_t n, - std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_zhbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_chemv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event hemv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hemv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const 
std::vector& dependencies) { return function_tables[libkey].row_major_zhemv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].row_major_cher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event her(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event her(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].row_major_zher_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_cher2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, 
dependencies); } -sycl::event her2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - std::int64_t lda, const std::vector &dependencies) { +sycl::event her2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_zher2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_chpmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *a, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event hpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* a, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, + const std::vector& dependencies) { return 
function_tables[libkey].row_major_zhpmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { +sycl::event hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { return function_tables[libkey].row_major_chpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event hpr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const std::complex *x, std::int64_t incx, - std::complex *a, const std::vector &dependencies) { +sycl::event hpr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const std::complex* x, std::int64_t incx, + std::complex* a, const std::vector& dependencies) { return function_tables[libkey].row_major_zhpr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { return function_tables[libkey].row_major_chpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::complex alpha, const std::complex *x, std::int64_t 
incx, - const std::complex *y, std::int64_t incy, std::complex *a, - const std::vector &dependencies) { +sycl::event hpr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::complex alpha, const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, std::complex* a, + const std::vector& dependencies) { return function_tables[libkey].row_major_zhpr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_ssbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event sbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsbmv_usm_sycl(queue, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - 
float alpha, const float *a, const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* a, const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_sspmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event spmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *a, const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, const std::vector &dependencies) { +sycl::event spmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* a, const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, const std::vector& dependencies) { return function_tables[libkey].row_major_dspmv_usm_sycl(queue, upper_lower, n, alpha, a, x, incx, beta, y, incy, dependencies); } -sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, - const std::vector &dependencies) { +sycl::event spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, + const std::vector& dependencies) { return function_tables[libkey].row_major_sspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event spr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, - const std::vector &dependencies) { +sycl::event spr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, 
double* a, + const std::vector& dependencies) { return function_tables[libkey].row_major_dspr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, dependencies); } -sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, const std::vector &dependencies) { +sycl::event spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies) { return function_tables[libkey].row_major_sspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event spr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, const std::vector &dependencies) { +sycl::event spr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, const std::vector& dependencies) { return function_tables[libkey].row_major_dspr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, dependencies); } -sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *a, std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* a, std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_ssymv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, 
dependencies); } -sycl::event symv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, std::int64_t incy, - const std::vector &dependencies) { +sycl::event symv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsymv_usm_sycl(queue, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy, dependencies); } -sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].row_major_ssyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event syr(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsyr_usm_sycl(queue, upper_lower, n, alpha, x, incx, a, lda, dependencies); } -sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - float alpha, const float *x, std::int64_t incx, const float *y, std::int64_t incy, - float *a, std::int64_t 
lda, const std::vector &dependencies) { +sycl::event syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + float alpha, const float* x, std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, const std::vector& dependencies) { return function_tables[libkey].row_major_ssyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event syr2(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, std::int64_t n, - double alpha, const double *x, std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies) { +sycl::event syr2(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, std::int64_t n, + double alpha, const double* x, std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsyr2_usm_sycl(queue, upper_lower, n, alpha, x, incx, y, incy, a, lda, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_stbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, 
sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_dtbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztbmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo 
upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_stbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_dtbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tbsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag 
unit_diag, std::int64_t n, std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztbsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, k, a, lda, x, incx, dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_stpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_dtpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, 
dependencies); } -sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztpmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, float *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, float* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_stpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, double *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, double* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_dtpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event 
tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event tpsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztpsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_strmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& 
dependencies) { return function_tables[libkey].row_major_dtrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trmv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trmv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztrmv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const float *a, std::int64_t lda, float *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const float* a, std::int64_t lda, float* x, + std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_strsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, 
dependencies); } -sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const double *a, std::int64_t lda, double *x, - std::int64_t incx, const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const double* a, std::int64_t lda, double* x, + std::int64_t incx, const std::vector& dependencies) { return function_tables[libkey].row_major_dtrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event trsv(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - diag unit_diag, std::int64_t n, const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies) { +sycl::event trsv(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + diag unit_diag, std::int64_t n, const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztrsv_usm_sycl(queue, upper_lower, trans, unit_diag, n, a, lda, x, incx, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - 
std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_sgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, double beta, double *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, double beta, double* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_dgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, 
std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_cgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_zgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies) { + const sycl::half* a, std::int64_t lda, const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_hgemm_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float 
*c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_f16f16f32_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16 *a, - std::int64_t lda, const bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event gemm(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const bfloat16* a, + std::int64_t lda, const bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_bf16bf16f32_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_chemm_usm_sycl( queue, 
left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event hemm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event hemm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_zhemm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const std::complex *a, - std::int64_t lda, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const std::complex* a, + std::int64_t lda, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_cherk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event herk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const std::complex *a, - std::int64_t lda, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event herk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const std::complex* a, + std::int64_t lda, 
double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_zherk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_cher2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event her2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event her2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_zher2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { 
+sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_ssymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_csymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event symm(oneapi::mkl::device libkey, 
sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event symm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_zsymm_usm_sycl( queue, left_right, upper_lower, m, n, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_ssyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, 
lda, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_csyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syrk(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_zsyrk_usm_sycl(queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, float *alpha, - const float **a, std::int64_t *lda, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, float* alpha, + const float** a, std::int64_t* lda, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const 
std::vector& dependencies) { return function_tables[libkey].row_major_ssyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, double *alpha, - const double **a, std::int64_t *lda, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, double* alpha, + const double** a, std::int64_t* lda, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_csyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, 
sycl::queue &queue, uplo *upper_lower, - transpose *trans, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo* upper_lower, + transpose* trans, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_zsyrk_batch_group_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, - transpose trans, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float beta, float *c, +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, + transpose trans, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_ssyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - double *c, std::int64_t ldc, std::int64_t 
stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, std::int64_t ldc, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_csyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event syrk_batch(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, std::complex *c, std::int64_t ldc, + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_zsyrk_batch_strided_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, stride_a, beta, c, ldc, 
stride_c, batch_size, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - const float *b, std::int64_t ldb, float beta, float *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + const float* b, std::int64_t ldb, float beta, float* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_ssyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - const double *b, std::int64_t ldb, double beta, double *c, std::int64_t ldc, - const std::vector &dependencies) { +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + const double* b, std::int64_t ldb, double beta, double* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_dsyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, 
const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_csyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, transpose trans, +sycl::event syr2k(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_zsyr2k_usm_sycl( queue, upper_lower, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_strmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, 
std::int64_t ldb, - const std::vector &dependencies) { + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_dtrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trmm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trmm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztrmm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, float alpha, - const float *a, 
std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies) { + const float* a, std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_strsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies) { + const double* a, std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_dtrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, uplo upper_lower, +sycl::event trsm(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - 
std::complex *b, std::int64_t ldb, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztrsm_usm_sycl(queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_strsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_dtrsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, 
alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_ctrsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side left_right, +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side left_right, uplo upper_lower, transpose trans, diag unit_diag, std::int64_t m, - std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, + std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_ztrsm_batch_strided_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, 
std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_strsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].row_major_dtrsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, 
+ uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_ctrsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } -sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue &queue, side *left_right, - uplo *upper_lower, transpose *trans, diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event trsm_batch(oneapi::mkl::device libkey, sycl::queue& queue, side* left_right, + uplo* upper_lower, transpose* trans, diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_ztrsm_batch_group_usm_sycl( queue, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b, ldb, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const float **a, std::int64_t *lda, const float **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const float** 
a, std::int64_t* lda, const float** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_sgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double *alpha, const double **a, std::int64_t *lda, const double **b, - std::int64_t *ldb, double *beta, double **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double* alpha, const double** a, std::int64_t* lda, const double** b, + std::int64_t* ldb, double* beta, double** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_dgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **b, std::int64_t *ldb, std::complex *beta, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const 
std::complex** b, std::int64_t* ldb, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies) { return function_tables[libkey].row_major_cgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex *alpha, const std::complex **a, - std::int64_t *lda, const std::complex **b, std::int64_t *ldb, - std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex* alpha, const std::complex** a, + std::int64_t* lda, const std::complex** b, std::int64_t* ldb, + std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_zgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - sycl::half *alpha, const sycl::half **a, std::int64_t *lda, - const sycl::half **b, std::int64_t *ldb, sycl::half *beta, sycl::half **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + sycl::half* alpha, const 
sycl::half** a, std::int64_t* lda, + const sycl::half** b, std::int64_t* ldb, sycl::half* beta, sycl::half** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_hgemm_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const sycl::half **a, std::int64_t *lda, const sycl::half **b, - std::int64_t *ldb, float *beta, float **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const sycl::half** a, std::int64_t* lda, const sycl::half** b, + std::int64_t* ldb, float* beta, float** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_f16f16f32_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* 
lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_s8s8f32_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *transa, - transpose *transb, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float *alpha, const std::int8_t **a, std::int64_t *lda, - const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies) { +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* transa, + transpose* transb, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float* alpha, const std::int8_t** a, std::int64_t* lda, + const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_s8s8s32_batch_group_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, group_count, group_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, - const float *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, + const float* b, std::int64_t ldb, std::int64_t 
stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_sgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, - const double *b, std::int64_t ldb, std::int64_t stride_b, double beta, - double *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, + const double* b, std::int64_t ldb, std::int64_t stride_b, double beta, + double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_dgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, 
- const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_cgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, const std::complex *b, std::int64_t ldb, - std::int64_t stride_b, std::complex beta, std::complex *c, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, const std::complex* b, std::int64_t ldb, + std::int64_t stride_b, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_zgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::half alpha, const sycl::half *a, std::int64_t lda, - std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, + sycl::half alpha, const sycl::half* a, std::int64_t lda, + std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) 
{ return function_tables[libkey].row_major_hgemm_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const sycl::half *a, std::int64_t lda, std::int64_t stride_a, - const sycl::half *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const sycl::half* a, std::int64_t lda, std::int64_t stride_a, + const sycl::half* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_f16f16f32_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies) { return 
function_tables[libkey].row_major_gemm_s8s8f32_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - float alpha, const std::int8_t *a, std::int64_t lda, std::int64_t stride_a, - const std::int8_t *b, std::int64_t ldb, std::int64_t stride_b, float beta, - std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + float alpha, const std::int8_t* a, std::int64_t lda, std::int64_t stride_a, + const std::int8_t* b, std::int64_t ldb, std::int64_t stride_b, float beta, + std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_s8s8s32_batch_strided_usm_sycl( queue, transa, transb, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *b, std::int64_t ldb, float beta, - float *c, std::int64_t ldc, const std::vector &dependencies) { + const float* a, std::int64_t lda, const float* b, std::int64_t ldb, float beta, + float* c, std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_sgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, 
+sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *b, std::int64_t ldb, double beta, - double *c, std::int64_t ldc, const std::vector &dependencies) { + const double* a, std::int64_t lda, const double* b, std::int64_t ldb, double beta, + double* c, std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_dgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_cgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue &queue, uplo upper_lower, +sycl::event gemmt(oneapi::mkl::device libkey, sycl::queue& queue, uplo upper_lower, transpose transa, transpose transb, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies) { + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, 
std::int64_t ldc, + const std::vector& dependencies) { return function_tables[libkey].row_major_zgemmt_usm_sycl(queue, upper_lower, transa, transb, n, k, alpha, a, lda, b, ldb, beta, c, ldc, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_s8u8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::int8_t *a, std::int64_t lda, - std::int8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::int8_t* a, std::int64_t lda, + std::int8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_s8s8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, 
n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::int8_t *b, std::int64_t ldb, std::int8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::int8_t* b, std::int64_t ldb, std::int8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_u8s8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event gemm_bias(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, offset offsetc, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, const std::uint8_t *a, std::int64_t lda, - std::uint8_t ao, const std::uint8_t *b, std::int64_t ldb, std::uint8_t bo, - float beta, std::int32_t *c, std::int64_t ldc, const std::int32_t *co, - const std::vector &dependencies) { + std::int64_t k, float alpha, const std::uint8_t* a, std::int64_t lda, + std::uint8_t ao, const std::uint8_t* b, std::int64_t ldb, std::uint8_t bo, + float beta, std::int32_t* c, std::int64_t ldc, const std::int32_t* co, + const std::vector& dependencies) { return function_tables[libkey].row_major_gemm_u8u8s32_bias_usm_sycl( queue, transa, transb, offsetc, m, n, k, alpha, a, lda, ao, b, ldb, bo, beta, c, ldc, co, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, 
sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, float *b, std::int64_t ldb, +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_somatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, double *b, std::int64_t ldb, +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_domatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + 
std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_comatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_zomatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, a, lda, stride_a, b, ldb, stride_b, batch_size, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_simatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, 
- std::int64_t batch_size, const std::vector &dependencies) { + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_dimatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_cimatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, + std::complex* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_zimatcopy_batch_strided_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, stride, batch_size, dependencies); } -sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, std::int64_t stride_a, float beta, - const float *b, std::int64_t ldb, std::int64_t stride_b, float *c, + const float* a, std::int64_t lda, std::int64_t stride_a, float beta, + const float* 
b, std::int64_t ldb, std::int64_t stride_b, float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_somatadd_batch_strided_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, std::int64_t stride_a, double beta, - const double *b, std::int64_t ldb, std::int64_t stride_b, double *c, + const double* a, std::int64_t lda, std::int64_t stride_a, double beta, + const double* b, std::int64_t ldb, std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].row_major_domatadd_batch_strided_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + 
std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_comatadd_batch_strided_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, + std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies) { + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies) { return function_tables[libkey].row_major_zomatadd_batch_strided_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, stride_a, beta, b, ldb, stride_b, c, ldc, stride_c, batch_size, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - float *b, std::int64_t ldb, const std::vector &dependencies) { +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + float* b, std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].row_major_somatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, 
std::int64_t ldb, - const std::vector &dependencies) { +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_domatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].row_major_comatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].row_major_zomatcopy_usm_sycl(queue, trans, m, n, alpha, a, lda, b, ldb, dependencies); } -sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, const float* a, 
std::int64_t lda, + std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { return function_tables[libkey].row_major_somatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, std::int64_t stridea, double *b, std::int64_t ldb, - std::int64_t strideb, const std::vector &dependencies) { +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, std::int64_t stridea, double* b, std::int64_t ldb, + std::int64_t strideb, const std::vector& dependencies) { return function_tables[libkey].row_major_domatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - std::complex *b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stridea, + std::complex* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { return function_tables[libkey].row_major_comatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event omatcopy2(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - std::complex *b, std::int64_t ldb, 
std::int64_t strideb, - const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::int64_t stridea, + std::complex* b, std::int64_t ldb, std::int64_t strideb, + const std::vector& dependencies) { return function_tables[libkey].row_major_zomatcopy2_usm_sycl( queue, trans, m, n, alpha, a, lda, stridea, b, ldb, strideb, dependencies); } -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, float alpha, float *ab, std::int64_t lda, - std::int64_t ldb, const std::vector &dependencies) { +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, float alpha, float* ab, std::int64_t lda, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].row_major_simatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, - std::int64_t m, std::int64_t n, double alpha, double *ab, std::int64_t lda, - std::int64_t ldb, const std::vector &dependencies) { +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, + std::int64_t m, std::int64_t n, double alpha, double* ab, std::int64_t lda, + std::int64_t ldb, const std::vector& dependencies) { return function_tables[libkey].row_major_dimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_cimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); 
} -sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue &queue, transpose trans, +sycl::event imatcopy(oneapi::mkl::device libkey, sycl::queue& queue, transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies) { + std::complex* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies) { return function_tables[libkey].row_major_zimatcopy_usm_sycl(queue, trans, m, n, alpha, ab, lda, ldb, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, float alpha, const float *a, - std::int64_t lda, float beta, const float *b, std::int64_t ldb, float *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, float alpha, const float* a, + std::int64_t lda, float beta, const float* b, std::int64_t ldb, float* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_somatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, - transpose transb, std::int64_t m, std::int64_t n, double alpha, const double *a, - std::int64_t lda, double beta, const double *b, std::int64_t ldb, double *c, - std::int64_t ldc, const std::vector &dependencies) { +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, + transpose transb, std::int64_t m, std::int64_t n, double alpha, const double* a, + std::int64_t lda, double beta, const double* b, std::int64_t ldb, double* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_domatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, 
c, ldc, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_comatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue &queue, transpose transa, +sycl::event omatadd(oneapi::mkl::device libkey, sycl::queue& queue, transpose transa, transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, std::complex *c, - std::int64_t ldc, const std::vector &dependencies) { + const std::complex* a, std::int64_t lda, std::complex beta, + const std::complex* b, std::int64_t ldb, std::complex* c, + std::int64_t ldc, const std::vector& dependencies) { return function_tables[libkey].row_major_zomatadd_usm_sycl( queue, transa, transb, m, n, alpha, a, lda, beta, b, ldb, c, ldc, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, const float **a, - std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, const float** a, + std::int64_t* lda, float** 
b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].row_major_somatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, const double **a, - std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, const double** a, + std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].row_major_domatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return function_tables[libkey].row_major_comatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - const 
std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event omatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return function_tables[libkey].row_major_zomatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, a, lda, b, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, float *alpha, float **ab, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, float* alpha, float** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return function_tables[libkey].row_major_simatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, double *alpha, double **ab, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, double* alpha, double** ab, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies) { return 
function_tables[libkey].row_major_dimatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].row_major_cimatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } -sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue &queue, transpose *trans, - std::int64_t *m, std::int64_t *n, std::complex *alpha, - std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies) { +sycl::event imatcopy_batch(oneapi::mkl::device libkey, sycl::queue& queue, transpose* trans, + std::int64_t* m, std::int64_t* n, std::complex* alpha, + std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies) { return function_tables[libkey].row_major_zimatcopy_batch_group_usm_sycl( queue, trans, m, n, alpha, ab, lda, ldb, group_count, groupsize, dependencies); } diff --git a/src/blas/function_table.hpp b/src/blas/function_table.hpp index a242fd0c0..c821a4a51 100644 --- a/src/blas/function_table.hpp +++ b/src/blas/function_table.hpp @@ -34,4940 +34,4940 @@ typedef struct { // Buffer APIs - void (*column_major_scasum_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, 
- sycl::buffer &result); - void (*column_major_dzasum_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*column_major_sasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*column_major_dasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*column_major_saxpy_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); - void (*column_major_daxpy_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); - void (*column_major_caxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_zaxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_saxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + void (*column_major_scasum_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_dzasum_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_sasum_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*column_major_dasum_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*column_major_saxpy_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); + void 
(*column_major_daxpy_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); + void (*column_major_caxpy_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zaxpy_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_saxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_daxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + void (*column_major_daxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_caxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, + void (*column_major_caxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_zaxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, + void (*column_major_zaxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_saxpby_sycl)(sycl::queue &queue, 
std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_daxpby_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_caxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, + void (*column_major_saxpby_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_daxpby_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_caxpby_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_zaxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zaxpby_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_scopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_scopy_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); - void (*column_major_dcopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, + void (*column_major_dcopy_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); - void (*column_major_ccopy_sycl)(sycl::queue &queue, std::int64_t n, - 
sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_zcopy_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_scopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + void (*column_major_ccopy_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zcopy_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_scopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_dcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + void (*column_major_dcopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_ccopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, + void (*column_major_ccopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_zcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, + void (*column_major_zcopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& 
y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_sdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*column_major_ddot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*column_major_dsdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*column_major_cdotc_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*column_major_zdotc_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*column_major_cdotu_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*column_major_zdotu_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*column_major_isamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*column_major_idamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*column_major_icamin_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*column_major_izamin_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*column_major_isamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void 
(*column_major_idamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*column_major_icamax_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*column_major_izamax_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*column_major_scnrm2_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*column_major_dznrm2_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*column_major_snrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*column_major_dnrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*column_major_srot_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, + void (*column_major_sdot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*column_major_ddot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*column_major_dsdot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*column_major_cdotc_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void (*column_major_zdotc_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void 
(*column_major_cdotu_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void (*column_major_zdotu_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void (*column_major_isamin_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*column_major_idamin_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*column_major_icamin_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_izamin_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_isamax_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*column_major_idamax_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*column_major_icamax_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_izamax_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_scnrm2_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_dznrm2_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*column_major_snrm2_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*column_major_dnrm2_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void 
(*column_major_srot_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s); - void (*column_major_drot_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, + void (*column_major_drot_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s); - void (*column_major_csrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, + void (*column_major_csrot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s); - void (*column_major_zdrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, + void (*column_major_zdrot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s); - void (*column_major_srotg_sycl)(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); - void (*column_major_drotg_sycl)(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); - void (*column_major_crotg_sycl)(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, - sycl::buffer &c, - sycl::buffer, 1> &s); - void (*column_major_zrotg_sycl)(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, - sycl::buffer &c, - sycl::buffer, 1> &s); - void (*column_major_srotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m); - void (*column_major_drotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer ¶m); - void (*column_major_srotmg_sycl)(sycl::queue 
&queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, - float y1, sycl::buffer ¶m); - void (*column_major_drotmg_sycl)(sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, - double y1, sycl::buffer ¶m); - void (*column_major_sscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx); - void (*column_major_dscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx); - void (*column_major_cscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_csscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_zscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_zdscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_sdsdot_sycl)(sycl::queue &queue, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*column_major_sswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, + void (*column_major_srotg_sycl)(sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + sycl::buffer& s); + void (*column_major_drotg_sycl)(sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + sycl::buffer& s); + void (*column_major_crotg_sycl)(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, + sycl::buffer& c, + sycl::buffer, 1>& s); + void (*column_major_zrotg_sycl)(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, + sycl::buffer& c, + sycl::buffer, 1>& s); + void (*column_major_srotm_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + 
sycl::buffer& param); + void (*column_major_drotm_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& param); + void (*column_major_srotmg_sycl)(sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, + float y1, sycl::buffer& param); + void (*column_major_drotmg_sycl)(sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, + double y1, sycl::buffer& param); + void (*column_major_sscal_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx); + void (*column_major_dscal_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx); + void (*column_major_cscal_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_csscal_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_zscal_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_zdscal_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_sdsdot_sycl)(sycl::queue& queue, std::int64_t n, float sb, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*column_major_sswap_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); - void (*column_major_dswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, + void (*column_major_dswap_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); - void (*column_major_cswap_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); 
- void (*column_major_zswap_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_sgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_cswap_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zswap_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_sgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); - void (*column_major_dgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t ku, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); + void (*column_major_dgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); - void (*column_major_cgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t ku, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); + void (*column_major_cgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, 
std::int64_t incy); - void (*column_major_zgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_sgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_sgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_dgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_dgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_cgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_cgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void 
(*column_major_zgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_sgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_sgemv_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, - float beta, sycl::buffer &y, + float beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_dgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_dgemv_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, - double beta, sycl::buffer &y, + double beta, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); void (*column_major_cgemv_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, sycl::buffer, 1> &y, + sycl::queue& 
queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); void (*column_major_zgemv_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, sycl::buffer, 1> &y, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*column_major_sdgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*column_major_sdgmm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); - void (*column_major_ddgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*column_major_ddgmm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, + 
sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); void (*column_major_cdgmm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); void (*column_major_zdgmm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); - void (*column_major_sger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - void (*column_major_dger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - void (*column_major_cgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + void (*column_major_sger_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, 
std::int64_t incy, + sycl::buffer& a, std::int64_t lda); + void (*column_major_dger_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda); + void (*column_major_cgerc_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_zgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_zgerc_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_cgeru_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_cgeru_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_zgeru_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_zgeru_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_chbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t 
incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_chbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_zhbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zhbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_chemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_chemv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_zhemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zhemv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_cher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& y, std::int64_t incy); + void 
(*column_major_cher_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_zher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_zher_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_cher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_cher2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_zher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_zher2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*column_major_chpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*column_major_chpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_zhpmv_sycl)(sycl::queue 
&queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_zhpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*column_major_chpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& y, std::int64_t incy); + void (*column_major_chpr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a); - void (*column_major_zhpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); + void (*column_major_zhpr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &a); - void (*column_major_chpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& a); + void (*column_major_chpr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); - void (*column_major_zhpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); + void (*column_major_zhpr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); - void (*column_major_ssbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t 
incy, + sycl::buffer, 1>& a); + void (*column_major_ssbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_dsbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_dsbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_sspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_dspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - void (*column_major_sspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a); - void (*column_major_dspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a); - void (*column_major_sspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); - void (*column_major_dspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a); - void 
(*column_major_ssymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); - void (*column_major_dsymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); - void (*column_major_ssyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda); - void (*column_major_dsyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &a, std::int64_t lda); - void (*column_major_ssyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - void (*column_major_dsyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, - std::int64_t incy, sycl::buffer &a, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_sspmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_dspmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void (*column_major_sspr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, + 
std::int64_t incx, sycl::buffer& a); + void (*column_major_dspr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& a); + void (*column_major_sspr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a); + void (*column_major_dspr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a); + void (*column_major_ssymv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); + void (*column_major_dsymv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); + void (*column_major_ssyr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& a, std::int64_t lda); + void (*column_major_dsyr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& a, std::int64_t lda); + void (*column_major_ssyr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda); + void (*column_major_dsyr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, + std::int64_t incy, sycl::buffer& a, std::int64_t lda); - void (*column_major_stbmv_sycl)(sycl::queue 
&queue, oneapi::mkl::uplo upper_lower, + void (*column_major_stbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx); - void (*column_major_dtbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); + void (*column_major_dtbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); - void (*column_major_ctbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_ctbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_ztbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_ztbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_stbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_stbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, 
std::int64_t incx); - void (*column_major_dtbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); + void (*column_major_dtbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); - void (*column_major_ctbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_ctbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_ztbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_ztbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_stpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_stpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*column_major_dtpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*column_major_dtpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, 
oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*column_major_ctpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*column_major_ctpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_ztpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_ztpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_stpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_stpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*column_major_dtpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*column_major_dtpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*column_major_ctpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*column_major_ctpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - 
sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_ztpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_ztpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); - void (*column_major_strmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*column_major_strmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void (*column_major_dtrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*column_major_dtrmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void (*column_major_ctrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*column_major_ctrmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*column_major_ztrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_ztrmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, 
sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*column_major_strsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_strsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void (*column_major_dtrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*column_major_dtrsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void (*column_major_ctrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*column_major_ctrsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*column_major_ztrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_ztrsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*column_major_sgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*column_major_sgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose 
transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc); - void (*column_major_dgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + float beta, sycl::buffer& c, std::int64_t ldc); + void (*column_major_dgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - double beta, sycl::buffer &c, std::int64_t ldc); - void (*column_major_cgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + double beta, sycl::buffer& c, std::int64_t ldc); + void (*column_major_cgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_zgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_hgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose 
transa, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_hgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - sycl::half beta, sycl::buffer &c, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + sycl::half beta, sycl::buffer& c, std::int64_t ldc); - void (*column_major_gemm_f16f16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*column_major_gemm_f16f16f32_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + float beta, sycl::buffer& c, std::int64_t ldc); - void (*column_major_gemm_bf16bf16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*column_major_gemm_bf16bf16f32_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, + sycl::buffer& a, std::int64_t lda, - sycl::buffer &b, + sycl::buffer& b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - void (*column_major_chemm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer& c, std::int64_t ldc); + void (*column_major_chemm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zhemm_sycl)(sycl::queue &queue, 
oneapi::mkl::side left_right, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_zhemm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_cherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_cherk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer, 1> &a, + float alpha, sycl::buffer, 1>& a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_zherk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer, 1> &a, + double alpha, sycl::buffer, 1>& a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_cher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_cher2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - float beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + float beta, sycl::buffer, 1>& c, std::int64_t ldc); - void (*column_major_zher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_zher2k_sycl)(sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - double beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + double beta, sycl::buffer, 1>& c, std::int64_t ldc); - void (*column_major_ssymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*column_major_ssymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - void (*column_major_dsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); + void (*column_major_dsymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - void (*column_major_csymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); + void (*column_major_csymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& c, std::int64_t ldc); + void 
(*column_major_zsymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_ssyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_ssyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - float beta, sycl::buffer &c, std::int64_t ldc); - void (*column_major_dsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t ldc); + void (*column_major_dsyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - double beta, sycl::buffer &c, std::int64_t ldc); - void (*column_major_csyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + double alpha, sycl::buffer& a, std::int64_t lda, + double beta, sycl::buffer& c, std::int64_t ldc); + void (*column_major_csyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_zsyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 
1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_ssyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_ssyrk_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, float beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*column_major_dsyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_dsyrk_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_csyrk_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_zsyrk_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, 
std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*column_major_ssyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_ssyr2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - void (*column_major_dsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); + void (*column_major_dsyr2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - void (*column_major_csyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); + void (*column_major_csyr2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_zsyr2k_sycl)(sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_strmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_strmm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*column_major_dtrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*column_major_dtrmm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*column_major_ctrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*column_major_ctrmm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*column_major_ztrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*column_major_ztrmm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, 
oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*column_major_strsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*column_major_strsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*column_major_dtrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*column_major_dtrsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*column_major_ctrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*column_major_ctrsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*column_major_ztrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*column_major_ztrsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - 
sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); void (*column_major_sgemm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, float beta, sycl::buffer &c, std::int64_t ldc, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_dgemm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, double beta, sycl::buffer &c, std::int64_t ldc, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, double beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_cgemm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t 
ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_zgemm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*column_major_hgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*column_major_hgemm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, sycl::half beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_gemm_f16f16f32_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, float beta, 
sycl::buffer &c, std::int64_t ldc, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_gemm_s8s8f32_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_gemm_s8s8s32_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_strsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, 
oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*column_major_dtrsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + double alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*column_major_ctrsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, std::int64_t ldb, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*column_major_ztrsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, 
std::int64_t ldb, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - void (*column_major_sgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*column_major_sgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - void (*column_major_dgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); + void (*column_major_dgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - void (*column_major_cgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); + void (*column_major_cgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_zgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose 
transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); void (*column_major_gemm_s8u8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); void (*column_major_gemm_s8s8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); void (*column_major_gemm_u8s8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, sycl::buffer &b, - 
std::int64_t ldb, int8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); void (*column_major_gemm_u8u8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); void (*column_major_somatcopy_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*column_major_domatcopy_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*column_major_comatcopy_batch_strided_sycl)( - sycl::queue &queue, 
oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, std::int64_t ldb, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*column_major_zomatcopy_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, std::int64_t ldb, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - void (*column_major_simatcopy_batch_strided_sycl)(sycl::queue &queue, + void (*column_major_simatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); - void (*column_major_dimatcopy_batch_strided_sycl)(sycl::queue &queue, + void (*column_major_dimatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); - void (*column_major_cimatcopy_batch_strided_sycl)(sycl::queue &queue, + void (*column_major_cimatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, 
std::int64_t stride, std::int64_t batch_size); - void (*column_major_zimatcopy_batch_strided_sycl)(sycl::queue &queue, + void (*column_major_zimatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); void (*column_major_somatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, float beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_domatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_comatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, 
oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*column_major_zomatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*column_major_somatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_somatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*column_major_domatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*column_major_domatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*column_major_comatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + 
sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*column_major_comatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*column_major_zomatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*column_major_zomatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*column_major_somatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*column_major_somatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); - void (*column_major_domatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_domatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); - void (*column_major_comatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_comatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, std::int64_t ldb, + 
sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); - void (*column_major_zomatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_zomatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); - void (*column_major_simatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_simatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); - void (*column_major_dimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_dimatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); - void (*column_major_cimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_cimatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); - void (*column_major_zimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*column_major_zimatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); - void (*column_major_somatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*column_major_somatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose 
transb, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, float beta, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &c, + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, float beta, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); - void (*column_major_domatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*column_major_domatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, double beta, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &c, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, double beta, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& c, std::int64_t ldc); - void (*column_major_comatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*column_major_comatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*column_major_zomatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*column_major_zomatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); // USM APIs - sycl::event (*column_major_scasum_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, 
std::int64_t incx, - float *result, - const std::vector &dependencies); - sycl::event (*column_major_dzasum_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - double *result, - const std::vector &dependencies); - sycl::event (*column_major_sasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies); - sycl::event (*column_major_dasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies); - sycl::event (*column_major_saxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, + sycl::event (*column_major_scasum_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + float* result, + const std::vector& dependencies); + sycl::event (*column_major_dzasum_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + double* result, + const std::vector& dependencies); + sycl::event (*column_major_sasum_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies); + sycl::event (*column_major_dasum_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies); + sycl::event (*column_major_saxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, float* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_daxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, + const std::vector& dependencies); + sycl::event (*column_major_daxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, double* y, std::int64_t incy, - const std::vector 
&dependencies); - sycl::event (*column_major_caxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, + const std::vector& dependencies); + sycl::event (*column_major_caxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_zaxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_zaxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); sycl::event (*column_major_saxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, - float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, float* alpha, const float** x, std::int64_t* incx, + float** y, std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_daxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, double *alpha, const double **x, std::int64_t *incx, - double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, double* alpha, const double** x, std::int64_t* incx, + double** y, std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_caxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, 
std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_zaxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_saxpy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, - std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, float alpha, const float* x, std::int64_t incx, + std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_daxpy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, double alpha, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, double alpha, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_caxpy_batch_strided_usm_sycl)( - sycl::queue &queue, 
std::int64_t n, std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, std::int64_t incy, + sycl::queue& queue, std::int64_t n, std::complex alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_zaxpy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); - sycl::event (*column_major_saxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, - float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_daxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, - const double beta, double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_caxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, + sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies); + sycl::event (*column_major_saxpby_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float beta, + float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_daxpby_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, + const double beta, double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_caxpby_usm_sycl)(sycl::queue& queue, 
std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, + const std::complex* x, std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_zaxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_zaxpby_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, + const std::complex* x, std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_scopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_dcopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_ccopy_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_zcopy_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_scopy_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_dcopy_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_ccopy_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t 
incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_zcopy_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); sycl::event (*column_major_scopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_dcopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const double** x, std::int64_t* incx, double** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_ccopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_zcopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const std::complex** x, 
std::int64_t* incx, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_scopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t stridex, - float *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, std::int64_t stridex, + float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*column_major_dcopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_ccopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_zcopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); - sycl::event (*column_major_sdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t 
incx, const float *y, std::int64_t incy, - float *result, - const std::vector &dependencies); - sycl::event (*column_major_ddot_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, - double *result, - const std::vector &dependencies); - sycl::event (*column_major_dsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - double *result, - const std::vector &dependencies); - sycl::event (*column_major_cdotc_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*column_major_zdotc_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*column_major_cdotu_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*column_major_zdotu_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*column_major_isamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies); - sycl::event (*column_major_idamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies); - sycl::event (*column_major_icamin_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event 
(*column_major_izamin_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event (*column_major_isamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies); - sycl::event (*column_major_idamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies); - sycl::event (*column_major_icamax_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event (*column_major_izamax_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event (*column_major_scnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - float *result, - const std::vector &dependencies); - sycl::event (*column_major_dznrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - double *result, - const std::vector &dependencies); - sycl::event (*column_major_snrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies); - sycl::event (*column_major_dnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies); - sycl::event (*column_major_srot_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, float c, + sycl::queue& queue, std::int64_t n, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); + sycl::event 
(*column_major_sdot_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* result, + const std::vector& dependencies); + sycl::event (*column_major_ddot_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, + double* result, + const std::vector& dependencies); + sycl::event (*column_major_dsdot_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + double* result, + const std::vector& dependencies); + sycl::event (*column_major_cdotc_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies); + sycl::event (*column_major_zdotc_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies); + sycl::event (*column_major_cdotu_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies); + sycl::event (*column_major_zdotu_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies); + sycl::event (*column_major_isamin_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_idamin_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_icamin_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t 
incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_izamin_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_isamax_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_idamax_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_icamax_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_izamax_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*column_major_scnrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + float* result, + const std::vector& dependencies); + sycl::event (*column_major_dznrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + double* result, + const std::vector& dependencies); + sycl::event (*column_major_snrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies); + sycl::event (*column_major_dnrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies); + sycl::event (*column_major_srot_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, float c, float s, - const std::vector &dependencies); - sycl::event (*column_major_drot_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t 
incx, - std::complex *y, std::int64_t incy, double c, + const std::vector& dependencies); + sycl::event (*column_major_drot_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, double c, double s, - const std::vector &dependencies); - sycl::event (*column_major_csrot_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_csrot_usm_sycl)(sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, float s, - const std::vector &dependencies); - sycl::event (*column_major_zdrot_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_zdrot_usm_sycl)(sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, - const std::vector &dependencies); - sycl::event (*column_major_srotg_usm_sycl)(sycl::queue &queue, float *a, float *b, float *c, - float *s, - const std::vector &dependencies); - sycl::event (*column_major_drotg_usm_sycl)(sycl::queue &queue, double *a, double *b, double *c, - double *s, - const std::vector &dependencies); - sycl::event (*column_major_crotg_usm_sycl)(sycl::queue &queue, std::complex *a, - std::complex *b, float *c, - std::complex *s, - const std::vector &dependencies); - sycl::event (*column_major_zrotg_usm_sycl)(sycl::queue &queue, std::complex *a, - std::complex *b, double *c, - std::complex *s, - const std::vector &dependencies); - sycl::event (*column_major_srotm_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - float *param, - const std::vector &dependencies); - sycl::event (*column_major_drotm_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, 
std::int64_t incy, - double *param, - const std::vector &dependencies); - sycl::event (*column_major_srotmg_usm_sycl)(sycl::queue &queue, float *d1, float *d2, float *x1, - float y1, float *param, - const std::vector &dependencies); - sycl::event (*column_major_drotmg_usm_sycl)(sycl::queue &queue, double *d1, double *d2, - double *x1, double y1, double *param, - const std::vector &dependencies); - sycl::event (*column_major_sscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_dscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_cscal_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, + const std::vector& dependencies); + sycl::event (*column_major_srotg_usm_sycl)(sycl::queue& queue, float* a, float* b, float* c, + float* s, + const std::vector& dependencies); + sycl::event (*column_major_drotg_usm_sycl)(sycl::queue& queue, double* a, double* b, double* c, + double* s, + const std::vector& dependencies); + sycl::event (*column_major_crotg_usm_sycl)(sycl::queue& queue, std::complex* a, + std::complex* b, float* c, + std::complex* s, + const std::vector& dependencies); + sycl::event (*column_major_zrotg_usm_sycl)(sycl::queue& queue, std::complex* a, + std::complex* b, double* c, + std::complex* s, + const std::vector& dependencies); + sycl::event (*column_major_srotm_usm_sycl)(sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + float* param, + const std::vector& dependencies); + sycl::event (*column_major_drotm_usm_sycl)(sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + double* param, + const std::vector& dependencies); + sycl::event (*column_major_srotmg_usm_sycl)(sycl::queue& queue, float* d1, float* d2, float* x1, + float 
y1, float* param, + const std::vector& dependencies); + sycl::event (*column_major_drotmg_usm_sycl)(sycl::queue& queue, double* d1, double* d2, + double* x1, double y1, double* param, + const std::vector& dependencies); + sycl::event (*column_major_sscal_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_dscal_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_cscal_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_csscal_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, + const std::vector& dependencies); + sycl::event (*column_major_csscal_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_zscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_zdscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_sdsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *result, - const std::vector &dependencies); - sycl::event (*column_major_sswap_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_dswap_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event 
(*column_major_cswap_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_zswap_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_sgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*column_major_zscal_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_zdscal_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_sdsdot_usm_sycl)(sycl::queue& queue, std::int64_t n, float sb, + const float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* result, + const std::vector& dependencies); + sycl::event (*column_major_sswap_usm_sycl)(sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_dswap_usm_sycl)(sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_cswap_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_zswap_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_sgbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, float alpha, const float *a, - 
std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_dgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_dgbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies); + std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, + double beta, double* y, std::int64_t incy, + const std::vector& dependencies); sycl::event (*column_major_cgbmv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); - sycl::event (*column_major_zgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); + sycl::event (*column_major_zgbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex 
*y, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_sgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*column_major_sgemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, + const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_dgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*column_major_dgemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, + const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_cgemv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); sycl::event (*column_major_zgemv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - 
std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); sycl::event (*column_major_sgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float beta, float *y, std::int64_t incy, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float beta, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_dgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double beta, double *y, std::int64_t incy, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double beta, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_cgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, 
std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_zgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex beta, std::complex *y, std::int64_t incy, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_sgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - float *alpha, const float **a, std::int64_t *lda, const float **x, std::int64_t *incx, - float *beta, float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + float* alpha, const float** a, std::int64_t* lda, const float** x, std::int64_t* incx, + float* beta, 
float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_dgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, const double **x, std::int64_t *incx, - double *beta, double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, const double** x, std::int64_t* incx, + double* beta, double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_cgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_zgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, 
oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_sdgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const float *a, std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const float* a, std::int64_t lda, std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_ddgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const double *a, std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const double* a, std::int64_t lda, std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_cdgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t 
ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*column_major_zdgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*column_major_sdgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n, - const float **a, std::int64_t *lda, const float **x, std::int64_t *incx, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const float** a, std::int64_t* lda, const float** x, std::int64_t* incx, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_ddgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n, - const double **a, std::int64_t *lda, const double **x, std::int64_t *incx, double 
**c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const double** a, std::int64_t* lda, const double** x, std::int64_t* incx, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_cdgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, const std::complex** x, + std::int64_t* incx, std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_zdgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); - sycl::event (*column_major_sger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *a, + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, const std::complex** x, + std::int64_t* incx, std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); + sycl::event (*column_major_sger_usm_sycl)(sycl::queue& queue, std::int64_t m, 
std::int64_t n, + float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_dger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, + const std::vector& dependencies); + sycl::event (*column_major_dger_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_cgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + const std::vector& dependencies); + sycl::event (*column_major_cgerc_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_zgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_zgerc_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_cgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_cgeru_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, 
std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_zgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_zgeru_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); sycl::event (*column_major_chbmv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); sycl::event (*column_major_zhbmv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); - sycl::event (*column_major_chemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex alpha, const std::complex* a, 
std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); + sycl::event (*column_major_chemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_zhemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_zhemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_cher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_cher_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_zher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_zher_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event 
(*column_major_cher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_cher2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_zher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_zher2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_chpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_chpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_zhpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_zhpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, - const 
std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_chpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_chpr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, - const std::vector &dependencies); - sycl::event (*column_major_zhpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies); + sycl::event (*column_major_zhpr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, - const std::vector &dependencies); - sycl::event (*column_major_chpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies); + sycl::event (*column_major_chpr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies); - sycl::event (*column_major_zhpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies); + sycl::event (*column_major_zhpr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies); - 
sycl::event (*column_major_ssbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies); + sycl::event (*column_major_ssbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, + const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_dsbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_dsbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, + const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_sspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *a, - const float *x, std::int64_t incx, float beta, - float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_dspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *a, - const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_sspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *a, - const std::vector &dependencies); - sycl::event (*column_major_dspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, 
double alpha, const double *x, - std::int64_t incx, double *a, - const std::vector &dependencies); - sycl::event (*column_major_sspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - float *a, - const std::vector &dependencies); - sycl::event (*column_major_dspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, - std::int64_t incy, double *a, - const std::vector &dependencies); - sycl::event (*column_major_ssymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_dsymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*column_major_ssyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_dsyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_ssyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - float *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_dsyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double 
alpha, const double *x, - std::int64_t incx, const double *y, - std::int64_t incy, double *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*column_major_stbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_sspmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* a, + const float* x, std::int64_t incx, float beta, + float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_dspmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* a, + const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_sspr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, + const std::vector& dependencies); + sycl::event (*column_major_dspr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, + const std::vector& dependencies); + sycl::event (*column_major_sspr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* a, + const std::vector& dependencies); + sycl::event (*column_major_dspr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, + std::int64_t incy, double* a, + const std::vector& dependencies); + sycl::event (*column_major_ssymv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies); 
+ sycl::event (*column_major_dsymv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, + double beta, double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*column_major_ssyr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_dsyr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_ssyr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_dsyr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, + std::int64_t incy, double* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*column_major_stbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_dtbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_dtbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const double *a, std::int64_t 
lda, - double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ctbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_ctbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ztbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ztbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_stbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_stbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_dtbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_dtbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - 
std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ctbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_ctbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ztbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ztbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_stpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_stpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_dtpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const float* a, float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_dtpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, double *x, 
std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ctpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const double* a, double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_ctpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, + const std::complex* a, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ztpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ztpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_stpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_stpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_dtpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const float* a, float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_dtpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ctpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const double* a, double* x, std::int64_t incx, + const std::vector& 
dependencies); + sycl::event (*column_major_ctpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, + const std::complex* a, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ztpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ztpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_strmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_strmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, std::int64_t lda, float *x, + const float* a, std::int64_t lda, float* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_dtrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_dtrmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, std::int64_t lda, double *x, + const double* a, std::int64_t lda, double* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ctrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ctrmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - 
const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ztrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_ztrmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_strsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_strsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, std::int64_t lda, float *x, + const float* a, std::int64_t lda, float* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_dtrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_dtrsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, std::int64_t lda, double *x, + const double* a, std::int64_t lda, double* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_ctrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ctrsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const 
std::vector &dependencies); - sycl::event (*column_major_ztrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_ztrsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*column_major_sgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*column_major_sgemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, + const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_dgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::vector& dependencies); + sycl::event (*column_major_dgemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, + const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_cgemm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, 
std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_zgemm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_hgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*column_major_hgemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, - const sycl::half *b, std::int64_t ldb, - sycl::half beta, sycl::half *c, std::int64_t ldc, - const std::vector &dependencies); + const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, + sycl::half beta, sycl::half* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_gemm_f16f16f32_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t 
ldc, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies); sycl::event (*column_major_gemm_bf16bf16f32_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const oneapi::mkl::bfloat16 *a, - std::int64_t lda, const oneapi::mkl::bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const oneapi::mkl::bfloat16* a, + std::int64_t lda, const oneapi::mkl::bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies); sycl::event (*column_major_chemm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*column_major_zhemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*column_major_zhemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const 
std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_cherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_cherk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, - const std::complex *a, std::int64_t lda, - float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_zherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*column_major_zherk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - const std::complex *a, std::int64_t lda, - double beta, std::complex *c, + const std::complex* a, std::int64_t lda, + double beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_cher2k_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, float beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, float beta, + std::complex* c, std::int64_t ldc, const std::vector& 
dependencies); sycl::event (*column_major_zher2k_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, double beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*column_major_ssymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, double beta, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*column_major_ssymm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_dsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*column_major_dsymm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_csymm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - 
std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*column_major_zsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*column_major_zsymm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_ssyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ssyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, float beta, float *c, + std::int64_t k, float alpha, const float* a, + std::int64_t lda, float beta, float* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_dsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_dsyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, double beta, double *c, + std::int64_t 
k, double alpha, const double* a, + std::int64_t lda, double beta, double* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_csyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_csyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_zsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_zsyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_ssyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, - float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, float* alpha, const float** a, std::int64_t* lda, + float* beta, float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_dsyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, 
oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, - double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, double* alpha, const double** a, std::int64_t* lda, + double* beta, double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_csyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_zsyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, std::complex *beta, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& 
dependencies); sycl::event (*column_major_ssyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_dsyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double beta, double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double beta, double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_csyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex *c, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - 
const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_zsyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex *c, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); - sycl::event (*column_major_ssyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*column_major_ssyr2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_dsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*column_major_dsyr2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_csyr2k_usm_sycl)( - 
sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*column_major_zsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*column_major_zsyr2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_strmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + const std::vector& dependencies); + sycl::event (*column_major_strmm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_dtrmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*column_major_dtrmm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo 
upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies); sycl::event (*column_major_ctrmm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); sycl::event (*column_major_ztrmm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); - sycl::event (*column_major_strsm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); + sycl::event (*column_major_strsm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_dtrsm_usm_sycl)(sycl::queue &queue, 
oneapi::mkl::side left_right, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*column_major_dtrsm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies); sycl::event (*column_major_ctrsm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); sycl::event (*column_major_ztrsm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); sycl::event (*column_major_strsm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, 
oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_dtrsm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_ctrsm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_ztrsm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, 
oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_strsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_dtrsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& 
dependencies); sycl::event (*column_major_ctrsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_ztrsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*column_major_sgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const float **a, - std::int64_t *lda, const float **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t 
*group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const float** a, + std::int64_t* lda, const float** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_dgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, const double **a, - std::int64_t *lda, const double **b, std::int64_t *ldb, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, double* alpha, const double** a, + std::int64_t* lda, const double** b, std::int64_t* ldb, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_cgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, const std::complex** b, + std::int64_t* ldb, std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* 
group_size, + const std::vector& dependencies); sycl::event (*column_major_zgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, const std::complex** b, + std::int64_t* ldb, std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_hgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, sycl::half *alpha, const sycl::half **a, - std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, sycl::half* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, sycl::half* beta, + sycl::half** c, std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_gemm_f16f16f32_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const sycl::half **a, - std::int64_t *lda, const 
sycl::half **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_gemm_s8s8f32_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_gemm_s8s8s32_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const 
std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*column_major_sgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, const float *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, const float* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_dgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, const double *b, std::int64_t ldb, - std::int64_t stride_b, double beta, double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, const double* b, std::int64_t ldb, + std::int64_t stride_b, double beta, double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_cgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, 
+ sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_zgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_hgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose 
transb, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_gemm_f16f16f32_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_gemm_s8s8f32_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, + std::int64_t stride_b, float beta, 
float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_gemm_s8s8s32_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb, - std::int64_t stride_b, float beta, std::int32_t *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); - sycl::event (*column_major_sgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, + std::int64_t stride_b, float beta, std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); + sycl::event (*column_major_sgemmt_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_dgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*column_major_dgemmt_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, 
- double beta, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_cgemmt_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_zgemmt_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_gemm_s8u8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::uint8_t *b, - std::int64_t ldb, std::uint8_t bo, float 
beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::uint8_t* b, + std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*column_major_gemm_s8s8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::int8_t *b, - std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::int8_t* b, + std::int64_t ldb, std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*column_major_gemm_u8s8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::int8_t *b, - std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, const std::int8_t* b, + std::int64_t ldb, std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*column_major_gemm_u8u8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + 
sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::uint8_t *b, - std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, const std::uint8_t* b, + std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*column_major_somatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_domatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*column_major_comatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - 
std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_zomatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_simatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, float *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_dimatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, double *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, + std::int64_t batch_size, 
const std::vector& dependencies); sycl::event (*column_major_cimatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, std::int64_t lda, std::int64_t ldb, - std::int64_t stride, std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_zimatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, std::int64_t lda, std::int64_t ldb, - std::int64_t stride, std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_somatadd_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float beta, const float *b, std::int64_t ldb, std::int64_t stride_b, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float beta, const float* b, std::int64_t ldb, std::int64_t stride_b, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*column_major_domatadd_batch_strided_usm_sycl)( - 
sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double beta, const double *b, std::int64_t ldb, - std::int64_t stride_b, double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double beta, const double* b, std::int64_t ldb, + std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*column_major_comatadd_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*column_major_zomatadd_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, 
std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies); - sycl::event (*column_major_somatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::event (*column_major_somatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_domatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*column_major_domatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_comatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*column_major_comatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_zomatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*column_major_zomatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const 
std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies); + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies); sycl::event (*column_major_somatcopy2_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stridea, float *b, - std::int64_t ldb, std::int64_t strideb, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, std::int64_t stridea, float* b, + std::int64_t ldb, std::int64_t strideb, const std::vector& dependencies); sycl::event (*column_major_domatcopy2_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stridea, double *b, - std::int64_t ldb, std::int64_t strideb, const std::vector &dependencies); - sycl::event (*column_major_comatcopy2_usm_sycl)(sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, std::int64_t stridea, double* b, + std::int64_t ldb, std::int64_t strideb, const std::vector& dependencies); + sycl::event (*column_major_comatcopy2_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies); - sycl::event (*column_major_zomatcopy2_usm_sycl)(sycl::queue &queue, + const std::vector& dependencies); + sycl::event (*column_major_zomatcopy2_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, 
std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies); - sycl::event (*column_major_simatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*column_major_simatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - float *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_dimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*column_major_dimatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_cimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*column_major_cimatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, + std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_zimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*column_major_zimatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, + std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*column_major_somatadd_usm_sycl)(sycl::queue &queue, 
oneapi::mkl::transpose transa, + const std::vector& dependencies); + sycl::event (*column_major_somatadd_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, float beta, const float *b, - std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*column_major_domatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, float beta, const float* b, + std::int64_t ldb, float* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*column_major_domatadd_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, double beta, const double *b, - std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, double beta, const double* b, + std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_comatadd_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*column_major_zomatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event 
(*column_major_zomatadd_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, + const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*column_major_somatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - float *alpha, const float **a, std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + float* alpha, const float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); sycl::event (*column_major_domatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); sycl::event (*column_major_comatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector 
&dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); sycl::event (*column_major_zomatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); sycl::event (*column_major_simatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - float *alpha, float **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + float* alpha, float** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); sycl::event (*column_major_dimatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, double **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, double** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); 
sycl::event (*column_major_cimatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); sycl::event (*column_major_zimatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, std::complex **ab, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, std::complex** ab, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); // Buffer APIs - void (*row_major_scasum_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_dzasum_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_sasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*row_major_dasum_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*row_major_saxpy_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); - void (*row_major_daxpy_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy); 
- void (*row_major_caxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_zaxpy_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_saxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + void (*row_major_scasum_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_dzasum_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_sasum_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_dasum_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_saxpy_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); + void (*row_major_daxpy_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy); + void (*row_major_caxpy_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_zaxpy_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_saxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_daxpy_batch_strided_sycl)(sycl::queue 
&queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + void (*row_major_daxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_caxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, + void (*row_major_caxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_zaxpy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, + void (*row_major_zaxpy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_saxpby_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); - void (*row_major_daxpby_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - void (*row_major_caxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + void (*row_major_saxpby_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + void (*row_major_daxpby_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void 
(*row_major_caxpby_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); - void (*row_major_zaxpby_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, + void (*row_major_zaxpby_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_scopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); - void (*row_major_dcopy_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); - void (*row_major_ccopy_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_zcopy_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_scopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_scopy_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); + void (*row_major_dcopy_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); + void (*row_major_ccopy_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_zcopy_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_scopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t 
incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_dcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer &x, std::int64_t incx, - std::int64_t stridex, sycl::buffer &y, + void (*row_major_dcopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer& x, std::int64_t incx, + std::int64_t stridex, sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_ccopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, + void (*row_major_ccopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_zcopy_batch_strided_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, + void (*row_major_zcopy_batch_strided_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &y, + sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_sdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*row_major_ddot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*row_major_dsdot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*row_major_cdotc_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*row_major_zdotc_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, 
std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*row_major_cdotu_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*row_major_zdotu_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &result); - void (*row_major_isamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*row_major_idamin_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*row_major_icamin_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_izamin_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_isamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*row_major_idamax_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*row_major_icamax_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_izamax_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_scnrm2_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_dznrm2_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer &result); - void (*row_major_snrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &result); - void (*row_major_dnrm2_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - 
std::int64_t incx, sycl::buffer &result); - void (*row_major_srot_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, float c, + void (*row_major_sdot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*row_major_ddot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*row_major_dsdot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*row_major_cdotc_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void (*row_major_zdotc_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void (*row_major_cdotu_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void (*row_major_zdotu_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& result); + void (*row_major_isamin_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_idamin_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_icamin_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_izamin_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_isamax_sycl)(sycl::queue& queue, 
std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_idamax_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_icamax_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_izamax_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_scnrm2_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_dznrm2_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer& result); + void (*row_major_snrm2_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_dnrm2_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& result); + void (*row_major_srot_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, float c, float s); - void (*row_major_drot_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, + void (*row_major_drot_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, double c, double s); - void (*row_major_csrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, + void (*row_major_csrot_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, float c, float s); - void (*row_major_zdrot_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, + void (*row_major_zdrot_sycl)(sycl::queue& queue, std::int64_t 
n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, double c, double s); - void (*row_major_srotg_sycl)(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); - void (*row_major_drotg_sycl)(sycl::queue &queue, sycl::buffer &a, - sycl::buffer &b, sycl::buffer &c, - sycl::buffer &s); - void (*row_major_crotg_sycl)(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, sycl::buffer &c, - sycl::buffer, 1> &s); - void (*row_major_zrotg_sycl)(sycl::queue &queue, sycl::buffer, 1> &a, - sycl::buffer, 1> &b, - sycl::buffer &c, - sycl::buffer, 1> &s); - void (*row_major_srotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m); - void (*row_major_drotm_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy, - sycl::buffer ¶m); - void (*row_major_srotmg_sycl)(sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, float y1, - sycl::buffer ¶m); - void (*row_major_drotmg_sycl)(sycl::queue &queue, sycl::buffer &d1, - sycl::buffer &d2, sycl::buffer &x1, - double y1, sycl::buffer ¶m); - void (*row_major_sscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx); - void (*row_major_dscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx); - void (*row_major_cscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_csscal_sycl)(sycl::queue &queue, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_zscal_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_zdscal_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - sycl::buffer, 1> &x, std::int64_t incx); - void 
(*row_major_sdsdot_sycl)(sycl::queue &queue, std::int64_t n, float sb, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &result); - void (*row_major_sswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); - void (*row_major_dswap_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &x, - std::int64_t incx, sycl::buffer &y, std::int64_t incy); - void (*row_major_cswap_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_zswap_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_sgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, + void (*row_major_srotg_sycl)(sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + sycl::buffer& s); + void (*row_major_drotg_sycl)(sycl::queue& queue, sycl::buffer& a, + sycl::buffer& b, sycl::buffer& c, + sycl::buffer& s); + void (*row_major_crotg_sycl)(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, sycl::buffer& c, + sycl::buffer, 1>& s); + void (*row_major_zrotg_sycl)(sycl::queue& queue, sycl::buffer, 1>& a, + sycl::buffer, 1>& b, + sycl::buffer& c, + sycl::buffer, 1>& s); + void (*row_major_srotm_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param); + void (*row_major_drotm_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy, + sycl::buffer& param); + void (*row_major_srotmg_sycl)(sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, float y1, + sycl::buffer& param); + void (*row_major_drotmg_sycl)(sycl::queue& queue, sycl::buffer& d1, + sycl::buffer& d2, sycl::buffer& x1, + double y1, sycl::buffer& param); + 
void (*row_major_sscal_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx); + void (*row_major_dscal_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx); + void (*row_major_cscal_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_csscal_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_zscal_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_zdscal_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_sdsdot_sycl)(sycl::queue& queue, std::int64_t n, float sb, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& result); + void (*row_major_sswap_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); + void (*row_major_dswap_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& x, + std::int64_t incx, sycl::buffer& y, std::int64_t incy); + void (*row_major_cswap_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_zswap_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_sgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); - void (*row_major_dgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& 
y, std::int64_t incy); + void (*row_major_dgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - void (*row_major_cgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void (*row_major_cgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_zgbmv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_zgbmv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_sgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); - void (*row_major_dgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); - void 
(*row_major_cgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_sgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); + void (*row_major_dgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); + void (*row_major_cgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); - void (*row_major_zgemv_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, + void (*row_major_zgemv_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_sgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_sgemv_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, float beta, - 
sycl::buffer &y, std::int64_t incy, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); - void (*row_major_dgemv_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_dgemv_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, double beta, - sycl::buffer &y, std::int64_t incy, + sycl::buffer& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); void (*row_major_cgemv_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, sycl::buffer, 1> &y, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size); void (*row_major_zgemv_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &x, std::int64_t incx, - std::int64_t stridex, std::complex beta, sycl::buffer, 1> &y, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& x, std::int64_t incx, + std::int64_t stridex, std::complex beta, sycl::buffer, 1>& y, std::int64_t incy, std::int64_t stridey, std::int64_t 
batch_size); - void (*row_major_sdgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*row_major_sdgmm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); - void (*row_major_ddgmm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*row_major_ddgmm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &x, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& x, std::int64_t incx, std::int64_t stridex, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); void (*row_major_cdgmm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, std::int64_t ldc, std::int64_t stridec, + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); void (*row_major_zdgmm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &x, std::int64_t incx, std::int64_t stridex, - sycl::buffer, 1> &c, 
std::int64_t ldc, std::int64_t stridec, + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, + sycl::buffer, 1>& x, std::int64_t incx, std::int64_t stridex, + sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size); - void (*row_major_sger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - void (*row_major_dger_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - void (*row_major_cgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, + void (*row_major_sger_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, float alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda); + void (*row_major_dger_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, double alpha, + sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda); + void (*row_major_cgerc_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); - void (*row_major_zgerc_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + void (*row_major_zgerc_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*row_major_cgeru_sycl)(sycl::queue &queue, std::int64_t m, 
std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*row_major_cgeru_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); - void (*row_major_zgeru_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + void (*row_major_zgeru_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*row_major_chbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*row_major_chbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); - void (*row_major_zhbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + void (*row_major_zhbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_chemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - 
std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_chemv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_zhemv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_zhemv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_cher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_cher_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a, std::int64_t lda); - void (*row_major_zher_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a, + void (*row_major_zher_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a, std::int64_t lda); - void (*row_major_cher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a, + void (*row_major_cher2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex 
alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a, std::int64_t lda); - void (*row_major_zher2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + void (*row_major_zher2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a, std::int64_t lda); - void (*row_major_chpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, - std::complex beta, sycl::buffer, 1> &y, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a, std::int64_t lda); + void (*row_major_chpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, + std::complex beta, sycl::buffer, 1>& y, std::int64_t incy); - void (*row_major_zhpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + void (*row_major_zhpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx, + sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx, std::complex beta, - sycl::buffer, 1> &y, std::int64_t incy); - void (*row_major_chpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a); - void (*row_major_zhpr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer, 1> &x, - std::int64_t incx, sycl::buffer, 1> &a); - void (*row_major_chpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &x, - std::int64_t incx, 
sycl::buffer, 1> &y, - std::int64_t incy, sycl::buffer, 1> &a); - void (*row_major_zhpr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + sycl::buffer, 1>& y, std::int64_t incy); + void (*row_major_chpr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a); + void (*row_major_zhpr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& a); + void (*row_major_chpr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& x, + std::int64_t incx, sycl::buffer, 1>& y, + std::int64_t incy, sycl::buffer, 1>& a); + void (*row_major_zhpr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &x, std::int64_t incx, - sycl::buffer, 1> &y, std::int64_t incy, - sycl::buffer, 1> &a); - void (*row_major_ssbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - float beta, sycl::buffer &y, std::int64_t incy); - void (*row_major_dsbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx, - double beta, sycl::buffer &y, std::int64_t incy); - void (*row_major_sspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, sycl::buffer &x, - std::int64_t incx, float beta, sycl::buffer &y, + sycl::buffer, 1>& x, std::int64_t incx, + sycl::buffer, 1>& y, std::int64_t incy, + sycl::buffer, 1>& a); + void (*row_major_ssbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, 
sycl::buffer& x, std::int64_t incx, + float beta, sycl::buffer& y, std::int64_t incy); + void (*row_major_dsbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx, + double beta, sycl::buffer& y, std::int64_t incy); + void (*row_major_sspmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& a, sycl::buffer& x, + std::int64_t incx, float beta, sycl::buffer& y, std::int64_t incy); - void (*row_major_dspmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, std::int64_t incy); - void (*row_major_sspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a); - void (*row_major_dspr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a); - void (*row_major_sspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); - void (*row_major_dspr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a); - void (*row_major_ssymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, float beta, - sycl::buffer &y, std::int64_t incy); - void (*row_major_dsymv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx, double beta, - sycl::buffer &y, 
std::int64_t incy); - void (*row_major_ssyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a, std::int64_t lda); - void (*row_major_dsyr_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &a, std::int64_t lda); - void (*row_major_ssyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - float alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - void (*row_major_dsyr2_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, - double alpha, sycl::buffer &x, std::int64_t incx, - sycl::buffer &y, std::int64_t incy, - sycl::buffer &a, std::int64_t lda); - void (*row_major_stbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_dspmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void (*row_major_sspr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); + void (*row_major_dspr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a); + void (*row_major_sspr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& a); + void (*row_major_dspr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& a); + void (*row_major_ssymv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + 
float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, float beta, + sycl::buffer& y, std::int64_t incy); + void (*row_major_dsymv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx, double beta, + sycl::buffer& y, std::int64_t incy); + void (*row_major_ssyr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); + void (*row_major_dsyr_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& a, std::int64_t lda); + void (*row_major_ssyr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + float alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda); + void (*row_major_dsyr2_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, + double alpha, sycl::buffer& x, std::int64_t incx, + sycl::buffer& y, std::int64_t incy, + sycl::buffer& a, std::int64_t lda); + void (*row_major_stbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx); - void (*row_major_dtbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); + void (*row_major_dtbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx); - void (*row_major_ctbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, 
std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); + void (*row_major_ctbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_ztbmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_ztbmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_stbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_stbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx); - void (*row_major_dtbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); + void (*row_major_dtbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &x, std::int64_t incx); - void (*row_major_ctbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& x, std::int64_t incx); + void (*row_major_ctbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - 
sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_ztbsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_ztbsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, std::int64_t k, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_stpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_stpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*row_major_dtpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*row_major_dtpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*row_major_ctpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*row_major_ctpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_ztpmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_ztpmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> 
&x, std::int64_t incx); - void (*row_major_stpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_stpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*row_major_dtpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*row_major_dtpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, - sycl::buffer &x, std::int64_t incx); - void (*row_major_ctpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, + sycl::buffer& x, std::int64_t incx); + void (*row_major_ctpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_ztpsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_ztpsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - sycl::buffer, 1> &x, std::int64_t incx); - void (*row_major_strmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer, 1>& a, + sycl::buffer, 1>& x, std::int64_t incx); + void (*row_major_strmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void (*row_major_dtrmv_sycl)(sycl::queue &queue, 
oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*row_major_dtrmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void (*row_major_ctrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*row_major_ctrmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*row_major_ztrmv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_ztrmv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*row_major_strsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_strsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void (*row_major_dtrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*row_major_dtrsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &x, std::int64_t incx); - void 
(*row_major_ctrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& x, std::int64_t incx); + void (*row_major_ctrsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*row_major_ztrsv_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_ztrsv_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, - std::int64_t n, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &x, + std::int64_t n, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& x, std::int64_t incx); - void (*row_major_sgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*row_major_sgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc); - void (*row_major_dgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + float beta, sycl::buffer& c, std::int64_t ldc); + void (*row_major_dgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, double alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - double beta, sycl::buffer &c, std::int64_t ldc); - void (*row_major_cgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + std::int64_t k, double alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + 
double beta, sycl::buffer& c, std::int64_t ldc); + void (*row_major_cgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); - void (*row_major_zgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*row_major_zgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_hgemm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_hgemm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::half alpha, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::half beta, sycl::buffer &c, std::int64_t ldc); - void (*row_major_gemm_f16f16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + std::int64_t k, sycl::half alpha, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::half beta, sycl::buffer& c, std::int64_t ldc); + void (*row_major_gemm_f16f16f32_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, - float beta, sycl::buffer &c, std::int64_t ldc); - void 
(*row_major_gemm_bf16bf16f32_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, + float beta, sycl::buffer& c, std::int64_t ldc); + void (*row_major_gemm_bf16bf16f32_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, + sycl::buffer& a, std::int64_t lda, - sycl::buffer &b, - std::int64_t ldb, float beta, sycl::buffer &c, + sycl::buffer& b, + std::int64_t ldb, float beta, sycl::buffer& c, std::int64_t ldc); - void (*row_major_chemm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*row_major_chemm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_zhemm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_zhemm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_cherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_cherk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer, 1> &a, + float alpha, sycl::buffer, 1>& a, std::int64_t lda, float beta, - sycl::buffer, 1> &c, std::int64_t 
ldc); - void (*row_major_zherk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_zherk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer, 1> &a, + double alpha, sycl::buffer, 1>& a, std::int64_t lda, double beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_cher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_cher2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - float beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + float beta, sycl::buffer, 1>& c, std::int64_t ldc); - void (*row_major_zher2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_zher2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - double beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + double beta, sycl::buffer, 1>& c, std::int64_t ldc); - void (*row_major_ssymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*row_major_ssymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_dsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, 
float beta, + sycl::buffer& c, std::int64_t ldc); + void (*row_major_dsymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_csymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); + void (*row_major_csymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_zsymm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_zsymm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_ssyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_ssyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - float beta, sycl::buffer &c, std::int64_t ldc); - void (*row_major_dsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& c, std::int64_t 
ldc); + void (*row_major_dsyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - double beta, sycl::buffer &c, std::int64_t ldc); - void (*row_major_csyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + double alpha, sycl::buffer& a, std::int64_t lda, + double beta, sycl::buffer& c, std::int64_t ldc); + void (*row_major_csyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - std::complex alpha, sycl::buffer, 1> &a, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_zsyrk_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_zsyrk_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_ssyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_ssyrk_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, float beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*row_major_dsyrk_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_dsyrk_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer 
&a, std::int64_t lda, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, double beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_csyrk_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_zsyrk_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*row_major_ssyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_ssyr2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_dsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); + void (*row_major_dsyr2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo 
upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_csyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + sycl::buffer& c, std::int64_t ldc); + void (*row_major_csyr2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); - void (*row_major_zsyr2k_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_zsyr2k_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_strmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& c, std::int64_t ldc); + void (*row_major_strmm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*row_major_dtrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*row_major_dtrmm_sycl)(sycl::queue& queue, oneapi::mkl::side 
left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*row_major_ctrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*row_major_ctrmm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb); - void (*row_major_ztrmm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*row_major_ztrmm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*row_major_strsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*row_major_strsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*row_major_dtrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + float alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*row_major_dtrsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - 
double alpha, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*row_major_ctrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + double alpha, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*row_major_ctrsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, - std::int64_t lda, sycl::buffer, 1> &b, + std::complex alpha, sycl::buffer, 1>& a, + std::int64_t lda, sycl::buffer, 1>& b, std::int64_t ldb); - void (*row_major_ztrsm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + void (*row_major_ztrsm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*row_major_sgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*row_major_sgemm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*row_major_dgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*row_major_dgemm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, 
std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, double beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_cgemm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_zgemm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer, 1> &b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*row_major_hgemm_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*row_major_hgemm_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, 
std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - sycl::buffer &a, std::int64_t lda, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, sycl::half beta, - sycl::buffer &c, std::int64_t ldc, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_gemm_f16f16f32_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, float beta, sycl::buffer &c, std::int64_t ldc, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, float beta, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_gemm_s8s8f32_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_gemm_s8s8s32_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose 
transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, float beta, - sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, float beta, + sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_strsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + float alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*row_major_dtrsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, + double alpha, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*row_major_ctrsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, 
- std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, std::int64_t ldb, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*row_major_ztrsm_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, std::int64_t ldb, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - void (*row_major_sgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_sgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, float beta, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_dgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, float beta, + sycl::buffer& c, std::int64_t ldc); + void (*row_major_dgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, double beta, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_cgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, double beta, + 
sycl::buffer& c, std::int64_t ldc); + void (*row_major_cgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, - std::complex beta, sycl::buffer, 1> &c, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, + std::complex beta, sycl::buffer, 1>& c, std::int64_t ldc); - void (*row_major_zgemmt_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + void (*row_major_zgemmt_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb, std::complex beta, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& c, std::int64_t ldc); void (*row_major_gemm_s8u8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); void (*row_major_gemm_s8s8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, 
std::int64_t lda, int8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, int8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); void (*row_major_gemm_u8s8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, sycl::buffer &b, - std::int64_t ldb, int8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); + sycl::buffer& a, std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, int8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); void (*row_major_gemm_u8u8s32_bias_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - sycl::buffer &a, std::int64_t lda, uint8_t ao, sycl::buffer &b, - std::int64_t ldb, uint8_t bo, float beta, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &co); - void (*row_major_somatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, uint8_t ao, sycl::buffer& b, + std::int64_t ldb, uint8_t bo, float beta, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& co); + void (*row_major_somatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, 
std::int64_t stride_b, std::int64_t batch_size); - void (*row_major_domatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_domatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*row_major_comatcopy_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, std::int64_t ldb, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); void (*row_major_zomatcopy_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer, 1> &b, std::int64_t ldb, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - void (*row_major_simatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_simatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); - void (*row_major_dimatcopy_batch_strided_sycl)(sycl::queue &queue, 
oneapi::mkl::transpose trans, + void (*row_major_dimatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); - void (*row_major_cimatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_cimatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); - void (*row_major_zimatcopy_batch_strided_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_zimatcopy_batch_strided_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, std::int64_t batch_size); void (*row_major_somatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, float alpha, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, float beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, float alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, float beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_domatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, double alpha, sycl::buffer &a, 
std::int64_t lda, - std::int64_t stride_a, double beta, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer &c, std::int64_t ldc, std::int64_t stride_c, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, double alpha, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, double beta, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_comatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); void (*row_major_zomatadd_batch_strided_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, std::int64_t stride_a, - std::complex beta, sycl::buffer, 1> &b, std::int64_t ldb, - std::int64_t stride_b, sycl::buffer, 1> &c, std::int64_t ldc, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stride_a, + std::complex beta, sycl::buffer, 1>& b, std::int64_t ldb, + std::int64_t stride_b, sycl::buffer, 1>& c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size); - void (*row_major_somatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void 
(*row_major_somatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*row_major_domatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*row_major_domatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb); - void (*row_major_comatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb); + void (*row_major_comatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*row_major_zomatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*row_major_zomatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - sycl::buffer, 1> &b, std::int64_t ldb); - void (*row_major_somatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::buffer, 1>& a, std::int64_t lda, + sycl::buffer, 1>& b, std::int64_t ldb); + void (*row_major_somatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); - void (*row_major_domatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_domatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose 
trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer &b, + sycl::buffer& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer& b, std::int64_t ldb, std::int64_t strideb); - void (*row_major_comatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_comatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, - std::int64_t stridea, sycl::buffer, 1> &b, + sycl::buffer, 1>& a, std::int64_t lda, + std::int64_t stridea, sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); - void (*row_major_zomatcopy2_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_zomatcopy2_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::int64_t stridea, - sycl::buffer, 1> &b, std::int64_t ldb, + sycl::buffer, 1>& b, std::int64_t ldb, std::int64_t strideb); - void (*row_major_simatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_simatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); - void (*row_major_dimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_dimatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - sycl::buffer &ab, std::int64_t lda, + sycl::buffer& ab, std::int64_t lda, std::int64_t ldb); - void (*row_major_cimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_cimatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t 
lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); - void (*row_major_zimatcopy_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + void (*row_major_zimatcopy_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &ab, std::int64_t lda, + sycl::buffer, 1>& ab, std::int64_t lda, std::int64_t ldb); - void (*row_major_somatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + void (*row_major_somatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - float alpha, sycl::buffer &a, std::int64_t lda, - float beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_domatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + float alpha, sycl::buffer& a, std::int64_t lda, + float beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc); + void (*row_major_domatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, - double alpha, sycl::buffer &a, std::int64_t lda, - double beta, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &c, std::int64_t ldc); - void (*row_major_comatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + double alpha, sycl::buffer& a, std::int64_t lda, + double beta, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& c, std::int64_t ldc); + void (*row_major_comatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); - void (*row_major_zomatadd_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, 
std::int64_t ldc); + void (*row_major_zomatadd_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - sycl::buffer, 1> &a, std::int64_t lda, + sycl::buffer, 1>& a, std::int64_t lda, std::complex beta, - sycl::buffer, 1> &b, std::int64_t ldb, - sycl::buffer, 1> &c, std::int64_t ldc); + sycl::buffer, 1>& b, std::int64_t ldb, + sycl::buffer, 1>& c, std::int64_t ldc); // USM APIs - sycl::event (*row_major_scasum_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - float *result, - const std::vector &dependencies); - sycl::event (*row_major_dzasum_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - double *result, - const std::vector &dependencies); - sycl::event (*row_major_sasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies); - sycl::event (*row_major_dasum_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies); - sycl::event (*row_major_saxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, float *y, + sycl::event (*row_major_scasum_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + float* result, + const std::vector& dependencies); + sycl::event (*row_major_dzasum_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + double* result, + const std::vector& dependencies); + sycl::event (*row_major_sasum_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies); + sycl::event (*row_major_dasum_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies); + sycl::event 
(*row_major_saxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, float* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_daxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, double *y, + const std::vector& dependencies); + sycl::event (*row_major_daxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, double* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_caxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, std::complex *y, + const std::vector& dependencies); + sycl::event (*row_major_caxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_zaxpy_usm_sycl)(sycl::queue &queue, std::int64_t n, + const std::vector& dependencies); + sycl::event (*row_major_zaxpy_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); sycl::event (*row_major_saxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, float *alpha, const float **x, std::int64_t *incx, - float **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, float* alpha, const float** x, std::int64_t* incx, + float** y, std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_daxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, double *alpha, 
const double **x, std::int64_t *incx, - double **y, std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, double* alpha, const double** x, std::int64_t* incx, + double** y, std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_caxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_zaxpy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, std::complex *alpha, - const std::complex **x, std::int64_t *incx, std::complex **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, std::complex* alpha, + const std::complex** x, std::int64_t* incx, std::complex** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_saxpy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, float alpha, const float *x, std::int64_t incx, - std::int64_t stridex, float *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, float alpha, const float* x, std::int64_t incx, + std::int64_t stridex, float* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_daxpy_batch_strided_usm_sycl)( - sycl::queue &queue, 
std::int64_t n, double alpha, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, double alpha, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_caxpy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, std::complex alpha, const std::complex *x, - std::int64_t incx, std::int64_t stridex, std::complex *y, std::int64_t incy, + sycl::queue& queue, std::int64_t n, std::complex alpha, const std::complex* x, + std::int64_t incx, std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_zaxpy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); - sycl::event (*row_major_saxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - const float *x, std::int64_t incx, const float beta, - float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_daxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - const double *x, std::int64_t incx, const double beta, - double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_caxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, + sycl::queue& queue, std::int64_t n, std::complex alpha, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies); + sycl::event 
(*row_major_saxpby_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + const float* x, std::int64_t incx, const float beta, + float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_daxpby_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + const double* x, std::int64_t incx, const double beta, + double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_caxpby_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex beta, std::complex *y, + const std::complex* x, std::int64_t incx, + const std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_zaxpby_usm_sycl)(sycl::queue &queue, std::int64_t n, + const std::vector& dependencies); + sycl::event (*row_major_zaxpby_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, + const std::complex* x, std::int64_t incx, const std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_scopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_dcopy_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_ccopy_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_zcopy_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event 
(*row_major_scopy_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_dcopy_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_ccopy_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_zcopy_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); sycl::event (*row_major_scopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const float **x, std::int64_t *incx, float **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const float** x, std::int64_t* incx, float** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_dcopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const double **x, std::int64_t *incx, double **y, - std::int64_t *incy, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const double** x, std::int64_t* incx, double** y, + std::int64_t* incy, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_ccopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const std::complex** x, std::int64_t* incx, + 
std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_zcopy_batch_group_usm_sycl)( - sycl::queue &queue, std::int64_t *n, const std::complex **x, std::int64_t *incx, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t* n, const std::complex** x, std::int64_t* incx, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_scopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const float *x, std::int64_t incx, std::int64_t stridex, - float *y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, const float* x, std::int64_t incx, std::int64_t stridex, + float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*row_major_dcopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const double *x, std::int64_t incx, - std::int64_t stridex, double *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, const double* x, std::int64_t incx, + std::int64_t stridex, double* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_ccopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, std::int64_t n, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, + 
std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_zcopy_batch_strided_usm_sycl)( - sycl::queue &queue, std::int64_t n, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); - sycl::event (*row_major_sdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - float *result, - const std::vector &dependencies); - sycl::event (*row_major_ddot_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, - double *result, - const std::vector &dependencies); - sycl::event (*row_major_dsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - double *result, - const std::vector &dependencies); - sycl::event (*row_major_cdotc_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*row_major_zdotc_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*row_major_cdotu_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*row_major_zdotu_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *result, - const std::vector &dependencies); - sycl::event (*row_major_isamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const 
std::vector &dependencies); - sycl::event (*row_major_idamin_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies); - sycl::event (*row_major_icamin_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event (*row_major_izamin_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event (*row_major_isamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies); - sycl::event (*row_major_idamax_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, std::int64_t *result, - const std::vector &dependencies); - sycl::event (*row_major_icamax_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event (*row_major_izamax_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - std::int64_t *result, - const std::vector &dependencies); - sycl::event (*row_major_scnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - float *result, - const std::vector &dependencies); - sycl::event (*row_major_dznrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, - const std::complex *x, std::int64_t incx, - double *result, - const std::vector &dependencies); - sycl::event (*row_major_snrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const float *x, - std::int64_t incx, float *result, - const std::vector &dependencies); - sycl::event (*row_major_dnrm2_usm_sycl)(sycl::queue &queue, std::int64_t n, const double *x, - std::int64_t incx, double *result, - const std::vector &dependencies); - sycl::event 
(*row_major_srot_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, float c, - float s, const std::vector &dependencies); - sycl::event (*row_major_drot_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, double c, - double s, const std::vector &dependencies); - sycl::event (*row_major_csrot_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, float c, - float s, const std::vector &dependencies); - sycl::event (*row_major_zdrot_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, + sycl::queue& queue, std::int64_t n, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); + sycl::event (*row_major_sdot_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* result, + const std::vector& dependencies); + sycl::event (*row_major_ddot_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, + double* result, + const std::vector& dependencies); + sycl::event (*row_major_dsdot_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + double* result, + const std::vector& dependencies); + sycl::event (*row_major_cdotc_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies); + sycl::event (*row_major_zdotc_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const 
std::vector& dependencies); + sycl::event (*row_major_cdotu_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies); + sycl::event (*row_major_zdotu_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* result, + const std::vector& dependencies); + sycl::event (*row_major_isamin_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_idamin_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_icamin_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_izamin_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_isamax_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_idamax_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_icamax_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_izamax_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + std::int64_t* result, + const std::vector& dependencies); + sycl::event (*row_major_scnrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, + const 
std::complex* x, std::int64_t incx, + float* result, + const std::vector& dependencies); + sycl::event (*row_major_dznrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, + const std::complex* x, std::int64_t incx, + double* result, + const std::vector& dependencies); + sycl::event (*row_major_snrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, const float* x, + std::int64_t incx, float* result, + const std::vector& dependencies); + sycl::event (*row_major_dnrm2_usm_sycl)(sycl::queue& queue, std::int64_t n, const double* x, + std::int64_t incx, double* result, + const std::vector& dependencies); + sycl::event (*row_major_srot_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, float c, + float s, const std::vector& dependencies); + sycl::event (*row_major_drot_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, double c, + double s, const std::vector& dependencies); + sycl::event (*row_major_csrot_usm_sycl)(sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, float c, + float s, const std::vector& dependencies); + sycl::event (*row_major_zdrot_usm_sycl)(sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, double c, double s, - const std::vector &dependencies); - sycl::event (*row_major_srotg_usm_sycl)(sycl::queue &queue, float *a, float *b, float *c, - float *s, const std::vector &dependencies); - sycl::event (*row_major_drotg_usm_sycl)(sycl::queue &queue, double *a, double *b, double *c, - double *s, - const std::vector &dependencies); - sycl::event (*row_major_crotg_usm_sycl)(sycl::queue &queue, std::complex *a, - std::complex *b, float *c, - std::complex *s, - const std::vector &dependencies); - sycl::event (*row_major_zrotg_usm_sycl)(sycl::queue &queue, std::complex *a, - std::complex *b, double *c, - std::complex *s, - const std::vector 
&dependencies); - sycl::event (*row_major_srotm_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, std::int64_t incy, - float *param, - const std::vector &dependencies); - sycl::event (*row_major_drotm_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - double *param, - const std::vector &dependencies); - sycl::event (*row_major_srotmg_usm_sycl)(sycl::queue &queue, float *d1, float *d2, float *x1, - float y1, float *param, - const std::vector &dependencies); - sycl::event (*row_major_drotmg_usm_sycl)(sycl::queue &queue, double *d1, double *d2, double *x1, - double y1, double *param, - const std::vector &dependencies); - sycl::event (*row_major_sscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_dscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_cscal_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, + const std::vector& dependencies); + sycl::event (*row_major_srotg_usm_sycl)(sycl::queue& queue, float* a, float* b, float* c, + float* s, const std::vector& dependencies); + sycl::event (*row_major_drotg_usm_sycl)(sycl::queue& queue, double* a, double* b, double* c, + double* s, + const std::vector& dependencies); + sycl::event (*row_major_crotg_usm_sycl)(sycl::queue& queue, std::complex* a, + std::complex* b, float* c, + std::complex* s, + const std::vector& dependencies); + sycl::event (*row_major_zrotg_usm_sycl)(sycl::queue& queue, std::complex* a, + std::complex* b, double* c, + std::complex* s, + const std::vector& dependencies); + sycl::event (*row_major_srotm_usm_sycl)(sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + float* param, + const std::vector& dependencies); + 
sycl::event (*row_major_drotm_usm_sycl)(sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + double* param, + const std::vector& dependencies); + sycl::event (*row_major_srotmg_usm_sycl)(sycl::queue& queue, float* d1, float* d2, float* x1, + float y1, float* param, + const std::vector& dependencies); + sycl::event (*row_major_drotmg_usm_sycl)(sycl::queue& queue, double* d1, double* d2, double* x1, + double y1, double* param, + const std::vector& dependencies); + sycl::event (*row_major_sscal_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_dscal_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_cscal_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_csscal_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex alpha, std::complex *x, + const std::vector& dependencies); + sycl::event (*row_major_csscal_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex alpha, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_zscal_usm_sycl)(sycl::queue &queue, std::int64_t n, float alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_zdscal_usm_sycl)(sycl::queue &queue, std::int64_t n, double alpha, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_sdsdot_usm_sycl)(sycl::queue &queue, std::int64_t n, float sb, - const float *x, std::int64_t incx, const float *y, - std::int64_t incy, float *result, - const std::vector &dependencies); - sycl::event (*row_major_sswap_usm_sycl)(sycl::queue &queue, std::int64_t n, float *x, - std::int64_t incx, float *y, 
std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_dswap_usm_sycl)(sycl::queue &queue, std::int64_t n, double *x, - std::int64_t incx, double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_cswap_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_zswap_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *x, std::int64_t incx, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_sgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_zscal_usm_sycl)(sycl::queue& queue, std::int64_t n, float alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_zdscal_usm_sycl)(sycl::queue& queue, std::int64_t n, double alpha, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_sdsdot_usm_sycl)(sycl::queue& queue, std::int64_t n, float sb, + const float* x, std::int64_t incx, const float* y, + std::int64_t incy, float* result, + const std::vector& dependencies); + sycl::event (*row_major_sswap_usm_sycl)(sycl::queue& queue, std::int64_t n, float* x, + std::int64_t incx, float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_dswap_usm_sycl)(sycl::queue& queue, std::int64_t n, double* x, + std::int64_t incx, double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_cswap_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_zswap_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* x, std::int64_t incx, + std::complex* y, std::int64_t incy, + const std::vector& 
dependencies); + sycl::event (*row_major_sgbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_dgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t ku, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_dgbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t kl, - std::int64_t ku, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies); + std::int64_t ku, double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, + double beta, double* y, std::int64_t incy, + const std::vector& dependencies); sycl::event (*row_major_cgbmv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); - sycl::event (*row_major_zgbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::int64_t kl, std::int64_t ku, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); + sycl::event (*row_major_zgbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, 
std::int64_t n, std::int64_t kl, std::int64_t ku, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_sgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_sgemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, + const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_dgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_dgemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, + const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_cgemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_cgemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); + std::complex* y, std::int64_t incy, + const 
std::vector& dependencies); sycl::event (*row_major_zgemv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); sycl::event (*row_major_sgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stridea, const float *x, - std::int64_t incx, std::int64_t stridex, float beta, float *y, std::int64_t incy, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, std::int64_t stridea, const float* x, + std::int64_t incx, std::int64_t stridex, float beta, float* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_dgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stridea, const double *x, - std::int64_t incx, std::int64_t stridex, double beta, double *y, std::int64_t incy, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, std::int64_t stridea, const double* x, + std::int64_t incx, std::int64_t stridex, double beta, double* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& 
dependencies); sycl::event (*row_major_cgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex beta, std::complex *y, std::int64_t incy, std::int64_t stridey, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_zgemv_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stridea, const std::complex *x, std::int64_t incx, - std::int64_t stridex, std::complex beta, std::complex *y, std::int64_t incy, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stridea, const std::complex* x, std::int64_t incx, + std::int64_t stridex, std::complex beta, std::complex* y, std::int64_t incy, std::int64_t stridey, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_sgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - float *alpha, const float **a, std::int64_t *lda, const float **x, std::int64_t *incx, - float *beta, float **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, 
std::int64_t* m, std::int64_t* n, + float* alpha, const float** a, std::int64_t* lda, const float** x, std::int64_t* incx, + float* beta, float** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_dgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, const double **x, std::int64_t *incx, - double *beta, double **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, const double** x, std::int64_t* incx, + double* beta, double** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_cgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_zgemv_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - const std::complex **x, std::int64_t *incx, std::complex *beta, - std::complex **y, std::int64_t *incy, 
std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + const std::complex** x, std::int64_t* incx, std::complex* beta, + std::complex** y, std::int64_t* incy, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_sdgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const float *a, std::int64_t lda, std::int64_t stridea, const float *x, std::int64_t incx, - std::int64_t stridex, float *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const float* a, std::int64_t lda, std::int64_t stridea, const float* x, std::int64_t incx, + std::int64_t stridex, float* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_ddgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const double *a, std::int64_t lda, std::int64_t stridea, const double *x, std::int64_t incx, - std::int64_t stridex, double *c, std::int64_t ldc, std::int64_t stridec, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const double* a, std::int64_t lda, std::int64_t stridea, const double* x, std::int64_t incx, + std::int64_t stridex, double* c, std::int64_t ldc, std::int64_t stridec, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_cdgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t 
stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*row_major_zdgmm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, - const std::complex *a, std::int64_t lda, std::int64_t stridea, - const std::complex *x, std::int64_t incx, std::int64_t stridex, - std::complex *c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side left_right, std::int64_t m, std::int64_t n, + const std::complex* a, std::int64_t lda, std::int64_t stridea, + const std::complex* x, std::int64_t incx, std::int64_t stridex, + std::complex* c, std::int64_t ldc, std::int64_t stridec, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*row_major_sdgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n, - const float **a, std::int64_t *lda, const float **x, std::int64_t *incx, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const float** a, std::int64_t* lda, const float** x, std::int64_t* incx, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_ddgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, 
std::int64_t *n, - const double **a, std::int64_t *lda, const double **x, std::int64_t *incx, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const double** a, std::int64_t* lda, const double** x, std::int64_t* incx, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_cdgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, const std::complex** x, + std::int64_t* incx, std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_zdgmm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, std::int64_t *m, std::int64_t *n, - const std::complex **a, std::int64_t *lda, const std::complex **x, - std::int64_t *incx, std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); - sycl::event (*row_major_sger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - float alpha, const float *x, std::int64_t incx, - const float *y, std::int64_t incy, float *a, + sycl::queue& queue, oneapi::mkl::side* left_right, std::int64_t* m, std::int64_t* n, + const std::complex** a, std::int64_t* lda, const std::complex** x, + std::int64_t* incx, std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& 
dependencies); + sycl::event (*row_major_sger_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + float alpha, const float* x, std::int64_t incx, + const float* y, std::int64_t incy, float* a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_dger_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - double alpha, const double *x, std::int64_t incx, - const double *y, std::int64_t incy, double *a, + const std::vector& dependencies); + sycl::event (*row_major_dger_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + double alpha, const double* x, std::int64_t incx, + const double* y, std::int64_t incy, double* a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_cgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, + const std::vector& dependencies); + sycl::event (*row_major_cgerc_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_zgerc_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + const std::vector& dependencies); + sycl::event (*row_major_zgerc_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_cgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *x, - std::int64_t incx, const std::complex *y, - std::int64_t incy, std::complex *a, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* 
a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_cgeru_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* x, + std::int64_t incx, const std::complex* y, + std::int64_t incy, std::complex* a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_zgeru_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + const std::vector& dependencies); + sycl::event (*row_major_zgeru_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_chbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_chbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *x, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, - const std::vector &dependencies); + std::complex* y, std::int64_t incy, + const std::vector& dependencies); sycl::event (*row_major_zhbmv_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, std::complex beta, - std::complex *y, std::int64_t incy, const std::vector &dependencies); - sycl::event (*row_major_chemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, + std::complex 
alpha, const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, std::complex beta, + std::complex* y, std::int64_t incy, const std::vector& dependencies); + sycl::event (*row_major_chemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_zhemv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_zhemv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, std::int64_t lda, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_cher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_cher_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_zher_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_zher_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - 
sycl::event (*row_major_cher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_cher2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_zher2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_zher2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_chpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_chpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, - const std::complex *x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_zhpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_zhpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *a, - const std::complex 
*x, std::int64_t incx, - std::complex beta, std::complex *y, + const std::complex* a, + const std::complex* x, std::int64_t incx, + std::complex beta, std::complex* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_chpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_chpr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, float alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, - const std::vector &dependencies); - sycl::event (*row_major_zhpr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies); + sycl::event (*row_major_zhpr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, double alpha, - const std::complex *x, std::int64_t incx, - std::complex *a, - const std::vector &dependencies); - sycl::event (*row_major_chpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + std::complex* a, + const std::vector& dependencies); + sycl::event (*row_major_chpr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies); - sycl::event (*row_major_zhpr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies); + sycl::event (*row_major_zhpr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::complex alpha, - const std::complex *x, std::int64_t incx, - const std::complex *y, std::int64_t incy, - std::complex *a, - const std::vector &dependencies); - sycl::event 
(*row_major_ssbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* x, std::int64_t incx, + const std::complex* y, std::int64_t incy, + std::complex* a, + const std::vector& dependencies); + sycl::event (*row_major_ssbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *x, - std::int64_t incx, float beta, float *y, + const float* a, std::int64_t lda, const float* x, + std::int64_t incx, float beta, float* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_dsbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_dsbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *x, - std::int64_t incx, double beta, double *y, + const double* a, std::int64_t lda, const double* x, + std::int64_t incx, double beta, double* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_sspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *a, - const float *x, std::int64_t incx, float beta, float *y, + const std::vector& dependencies); + sycl::event (*row_major_sspmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* a, + const float* x, std::int64_t incx, float beta, float* y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_dspmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *a, - const double *x, std::int64_t incx, double beta, - double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_sspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, 
const float *x, - std::int64_t incx, float *a, - const std::vector &dependencies); - sycl::event (*row_major_dspr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *a, - const std::vector &dependencies); - sycl::event (*row_major_sspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - float *a, const std::vector &dependencies); - sycl::event (*row_major_dspr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, - double *a, - const std::vector &dependencies); - sycl::event (*row_major_ssymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *x, std::int64_t incx, - float beta, float *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_dsymv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *x, std::int64_t incx, - double beta, double *y, std::int64_t incy, - const std::vector &dependencies); - sycl::event (*row_major_ssyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, float *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_dsyr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, double *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_ssyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, float alpha, const float *x, - std::int64_t incx, const float *y, std::int64_t incy, - float *a, 
std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_dsyr2_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, - std::int64_t n, double alpha, const double *x, - std::int64_t incx, const double *y, std::int64_t incy, - double *a, std::int64_t lda, - const std::vector &dependencies); - sycl::event (*row_major_stbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_dspmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* a, + const double* x, std::int64_t incx, double beta, + double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_sspr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, + const std::vector& dependencies); + sycl::event (*row_major_dspr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, + const std::vector& dependencies); + sycl::event (*row_major_sspr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* a, const std::vector& dependencies); + sycl::event (*row_major_dspr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, + double* a, + const std::vector& dependencies); + sycl::event (*row_major_ssymv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, const float* x, std::int64_t incx, + float beta, float* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_dsymv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, 
double alpha, const double* a, + std::int64_t lda, const double* x, std::int64_t incx, + double beta, double* y, std::int64_t incy, + const std::vector& dependencies); + sycl::event (*row_major_ssyr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, float* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_dsyr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, double* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_ssyr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, float alpha, const float* x, + std::int64_t incx, const float* y, std::int64_t incy, + float* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_dsyr2_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, + std::int64_t n, double alpha, const double* x, + std::int64_t incx, const double* y, std::int64_t incy, + double* a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*row_major_stbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_dtbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_dtbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ctbmv_usm_sycl)(sycl::queue &queue, 
oneapi::mkl::uplo upper_lower, + std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_ctbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ztbmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ztbmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_stbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_stbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const float *a, std::int64_t lda, - float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_dtbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, const float* a, std::int64_t lda, + float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_dtbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const double *a, std::int64_t lda, - double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event 
(*row_major_ctbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, const double* a, std::int64_t lda, + double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_ctbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ztbsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ztbsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - std::int64_t k, const std::complex *a, - std::int64_t lda, std::complex *x, + std::int64_t k, const std::complex* a, + std::int64_t lda, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_stpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_stpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_dtpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const float* a, float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_dtpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ctpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const 
double* a, double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_ctpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, + const std::complex* a, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ztpmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ztpmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, + const std::complex* a, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_stpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_stpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, float *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_dtpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const float* a, float* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_dtpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, double *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ctpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const double* a, double* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_ctpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, 
std::complex *x, + const std::complex* a, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ztpsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ztpsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::complex *x, + const std::complex* a, std::complex* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_strmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_strmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, std::int64_t lda, float *x, + const float* a, std::int64_t lda, float* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_dtrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_dtrmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, std::int64_t lda, double *x, + const double* a, std::int64_t lda, double* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ctrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ctrmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ztrmv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + 
std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_ztrmv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_strsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_strsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const float *a, std::int64_t lda, float *x, + const float* a, std::int64_t lda, float* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_dtrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_dtrsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const double *a, std::int64_t lda, double *x, + const double* a, std::int64_t lda, double* x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ctrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ctrsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_ztrsv_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event 
(*row_major_ztrsv_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t n, - const std::complex *a, std::int64_t lda, - std::complex *x, std::int64_t incx, - const std::vector &dependencies); - sycl::event (*row_major_sgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::complex* a, std::int64_t lda, + std::complex* x, std::int64_t incx, + const std::vector& dependencies); + sycl::event (*row_major_sgemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const float *a, std::int64_t lda, const float *b, - std::int64_t ldb, float beta, float *c, + const float* a, std::int64_t lda, const float* b, + std::int64_t ldb, float beta, float* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_dgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::vector& dependencies); + sycl::event (*row_major_dgemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, double alpha, - const double *a, std::int64_t lda, const double *b, - std::int64_t ldb, double beta, double *c, + const double* a, std::int64_t lda, const double* b, + std::int64_t ldb, double beta, double* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_cgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::vector& dependencies); + sycl::event (*row_major_cgemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, + std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, std::complex beta, - 
std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_zgemm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_hgemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_hgemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, - const sycl::half *a, std::int64_t lda, - const sycl::half *b, std::int64_t ldb, sycl::half beta, - sycl::half *c, std::int64_t ldc, - const std::vector &dependencies); + const sycl::half* a, std::int64_t lda, + const sycl::half* b, std::int64_t ldb, sycl::half beta, + sycl::half* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_gemm_f16f16f32_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, const sycl::half *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, const sycl::half* b, std::int64_t ldb, float beta, float* c, + 
std::int64_t ldc, const std::vector& dependencies); sycl::event (*row_major_gemm_bf16bf16f32_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const oneapi::mkl::bfloat16 *a, - std::int64_t lda, const oneapi::mkl::bfloat16 *b, std::int64_t ldb, float beta, float *c, - std::int64_t ldc, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const oneapi::mkl::bfloat16* a, + std::int64_t lda, const oneapi::mkl::bfloat16* b, std::int64_t ldb, float beta, float* c, + std::int64_t ldc, const std::vector& dependencies); sycl::event (*row_major_chemm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*row_major_zhemm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*row_major_zhemm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event 
(*row_major_cherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_cherk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, float alpha, - const std::complex *a, std::int64_t lda, - float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_zherk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_zherk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, double alpha, - const std::complex *a, std::int64_t lda, - double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_cher2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_cher2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - float beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_zher2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + float beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_zher2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const 
std::complex *b, std::int64_t ldb, - double beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_ssymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + double beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_ssymm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_dsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_dsymm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_csymm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*row_major_zsymm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo 
upper_lower, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*row_major_zsymm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_ssyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ssyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, float beta, float *c, + std::int64_t k, float alpha, const float* a, + std::int64_t lda, float beta, float* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_dsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_dsyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, double beta, double *c, + std::int64_t k, double alpha, const double* a, + std::int64_t lda, double beta, double* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_csyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_csyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, 
std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_zsyrk_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_zsyrk_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_ssyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, float *alpha, const float **a, std::int64_t *lda, - float *beta, float **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, float* alpha, const float** a, std::int64_t* lda, + float* beta, float** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_dsyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, double *alpha, const double **a, std::int64_t *lda, - double *beta, double **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, double* alpha, const double** a, std::int64_t* 
lda, + double* beta, double** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_csyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_zsyrk_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *upper_lower, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, std::complex *beta, - std::complex **c, std::int64_t *ldc, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* upper_lower, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, std::complex* beta, + std::complex** c, std::int64_t* ldc, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_ssyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo 
upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_dsyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double beta, double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, double alpha, const double* a, std::int64_t lda, + std::int64_t stride_a, double beta, double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_csyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex *c, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_zsyrk_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex *c, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, 
oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); - sycl::event (*row_major_ssyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + const std::vector& dependencies); + sycl::event (*row_major_ssyr2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_dsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_dsyr2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_csyr2k_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t k, std::complex alpha, const std::complex *a, - std::int64_t lda, const std::complex *b, std::int64_t ldb, std::complex beta, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*row_major_zsyr2k_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, 
oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t k, std::complex alpha, const std::complex* a, + std::int64_t lda, const std::complex* b, std::int64_t ldb, std::complex beta, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*row_major_zsyr2k_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, - const std::complex *b, std::int64_t ldb, - std::complex beta, std::complex *c, + const std::complex* a, std::int64_t lda, + const std::complex* b, std::int64_t ldb, + std::complex beta, std::complex* c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_strmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + const std::vector& dependencies); + sycl::event (*row_major_strmm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_dtrmm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*row_major_dtrmm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies); sycl::event (*row_major_ctrmm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side 
left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); sycl::event (*row_major_ztrmm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); - sycl::event (*row_major_strsm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); + sycl::event (*row_major_strsm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, float *b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_dtrsm_usm_sycl)(sycl::queue &queue, oneapi::mkl::side left_right, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, float* b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*row_major_dtrsm_usm_sycl)(sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, double *b, std::int64_t ldb, - const 
std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, double* b, std::int64_t ldb, + const std::vector& dependencies); sycl::event (*row_major_ctrsm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); sycl::event (*row_major_ztrsm_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, const std::vector& dependencies); sycl::event (*row_major_strsm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_dtrsm_batch_strided_usm_sycl)( - sycl::queue &queue, 
oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_ctrsm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_ztrsm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::side left_right, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, oneapi::mkl::diag unit_diag, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies); + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); 
sycl::event (*row_major_strsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, float *alpha, const float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, float* alpha, const float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_dtrsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, double *alpha, const double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, double* alpha, const double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_ctrsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* 
left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_ztrsm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::side *left_right, oneapi::mkl::uplo *upper_lower, - oneapi::mkl::transpose *trans, oneapi::mkl::diag *unit_diag, std::int64_t *m, - std::int64_t *n, std::complex *alpha, const std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::side* left_right, oneapi::mkl::uplo* upper_lower, + oneapi::mkl::transpose* trans, oneapi::mkl::diag* unit_diag, std::int64_t* m, + std::int64_t* n, std::complex* alpha, const std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_size, const std::vector& dependencies); sycl::event (*row_major_sgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const float **a, - std::int64_t *lda, const float **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const float** a, + std::int64_t* lda, const float** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_dgemm_batch_group_usm_sycl)( - sycl::queue &queue, 
oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, double *alpha, const double **a, - std::int64_t *lda, const double **b, std::int64_t *ldb, double *beta, double **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, double* alpha, const double** a, + std::int64_t* lda, const double** b, std::int64_t* ldb, double* beta, double** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_cgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, const std::complex** b, + std::int64_t* ldb, std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_zgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, std::complex *alpha, - const std::complex **a, std::int64_t *lda, const std::complex **b, - std::int64_t *ldb, std::complex *beta, std::complex **c, std::int64_t *ldc, - std::int64_t group_count, std::int64_t *group_size, - const std::vector 
&dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, std::complex* alpha, + const std::complex** a, std::int64_t* lda, const std::complex** b, + std::int64_t* ldb, std::complex* beta, std::complex** c, std::int64_t* ldc, + std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_hgemm_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, sycl::half *alpha, const sycl::half **a, - std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, sycl::half *beta, - sycl::half **c, std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, sycl::half* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, sycl::half* beta, + sycl::half** c, std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_gemm_f16f16f32_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const sycl::half **a, - std::int64_t *lda, const sycl::half **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const sycl::half** a, + std::int64_t* lda, const sycl::half** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* 
group_size, + const std::vector& dependencies); sycl::event (*row_major_gemm_s8s8f32_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, float **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const std::int8_t** b, std::int64_t* ldb, float* beta, float** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_gemm_s8s8s32_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *transa, oneapi::mkl::transpose *transb, - std::int64_t *m, std::int64_t *n, std::int64_t *k, float *alpha, const std::int8_t **a, - std::int64_t *lda, const std::int8_t **b, std::int64_t *ldb, float *beta, std::int32_t **c, - std::int64_t *ldc, std::int64_t group_count, std::int64_t *group_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* transa, oneapi::mkl::transpose* transb, + std::int64_t* m, std::int64_t* n, std::int64_t* k, float* alpha, const std::int8_t** a, + std::int64_t* lda, const std::int8_t** b, std::int64_t* ldb, float* beta, std::int32_t** c, + std::int64_t* ldc, std::int64_t group_count, std::int64_t* group_size, + const std::vector& dependencies); sycl::event (*row_major_sgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float *a, - std::int64_t lda, std::int64_t stride_a, const float *b, std::int64_t ldb, - std::int64_t stride_b, float 
beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const float* a, + std::int64_t lda, std::int64_t stride_a, const float* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_dgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double *a, - std::int64_t lda, std::int64_t stride_a, const double *b, std::int64_t ldb, - std::int64_t stride_b, double beta, double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, double alpha, const double* a, + std::int64_t lda, std::int64_t stride_a, const double* b, std::int64_t ldb, + std::int64_t stride_b, double beta, double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_cgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const 
std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_zgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, std::int64_t stride_a, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex beta, std::complex *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + const std::complex* a, std::int64_t lda, std::int64_t stride_a, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex beta, std::complex* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_hgemm_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, sycl::half beta, sycl::half *c, std::int64_t ldc, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::half alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, sycl::half beta, sycl::half* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_gemm_f16f16f32_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, 
oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half *a, - std::int64_t lda, std::int64_t stride_a, const sycl::half *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const sycl::half* a, + std::int64_t lda, std::int64_t stride_a, const sycl::half* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_gemm_s8s8f32_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb, - std::int64_t stride_b, float beta, float *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, + std::int64_t stride_b, float beta, float* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_gemm_s8s8s32_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t *a, - std::int64_t lda, std::int64_t stride_a, const std::int8_t *b, std::int64_t ldb, - std::int64_t stride_b, float beta, std::int32_t *c, std::int64_t ldc, 
std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); - sycl::event (*row_major_sgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::int64_t k, float alpha, const std::int8_t* a, + std::int64_t lda, std::int64_t stride_a, const std::int8_t* b, std::int64_t ldb, + std::int64_t stride_b, float beta, std::int32_t* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); + sycl::event (*row_major_sgemmt_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, - std::int64_t k, float alpha, const float *a, - std::int64_t lda, const float *b, std::int64_t ldb, - float beta, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_dgemmt_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo upper_lower, + std::int64_t k, float alpha, const float* a, + std::int64_t lda, const float* b, std::int64_t ldb, + float beta, float* c, std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_dgemmt_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, - std::int64_t k, double alpha, const double *a, - std::int64_t lda, const double *b, std::int64_t ldb, - double beta, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t k, double alpha, const double* a, + std::int64_t lda, const double* b, std::int64_t ldb, + double beta, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_cgemmt_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, 
std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_zgemmt_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, + sycl::queue& queue, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t n, std::int64_t k, std::complex alpha, - const std::complex *a, std::int64_t lda, const std::complex *b, - std::int64_t ldb, std::complex beta, std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + const std::complex* a, std::int64_t lda, const std::complex* b, + std::int64_t ldb, std::complex beta, std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_gemm_s8u8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::uint8_t *b, - std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::uint8_t* b, + std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*row_major_gemm_s8s8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, 
oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::int8_t *a, std::int64_t lda, std::int8_t ao, const std::int8_t *b, - std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::int8_t* a, std::int64_t lda, std::int8_t ao, const std::int8_t* b, + std::int64_t ldb, std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*row_major_gemm_u8s8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::int8_t *b, - std::int64_t ldb, std::int8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, const std::int8_t* b, + std::int64_t ldb, std::int8_t bo, float beta, std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*row_major_gemm_u8u8s32_bias_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, oneapi::mkl::offset offsetc, std::int64_t m, std::int64_t n, std::int64_t k, float alpha, - const std::uint8_t *a, std::int64_t lda, std::uint8_t ao, const std::uint8_t *b, - std::int64_t ldb, std::uint8_t bo, float beta, std::int32_t *c, std::int64_t ldc, - const std::int32_t *co, const std::vector &dependencies); + const std::uint8_t* a, std::int64_t lda, std::uint8_t ao, const std::uint8_t* b, + std::int64_t ldb, std::uint8_t bo, float beta, 
std::int32_t* c, std::int64_t ldc, + const std::int32_t* co, const std::vector& dependencies); sycl::event (*row_major_somatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, const float *a, std::int64_t lda, std::int64_t stride_a, float *b, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, const float* a, std::int64_t lda, std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_domatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, const double *a, std::int64_t lda, std::int64_t stride_a, double *b, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, const double* a, std::int64_t lda, std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*row_major_comatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_zomatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, const std::complex *a, 
std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, const std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_simatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - float alpha, float *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + float alpha, float* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_dimatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - double alpha, double *ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + double alpha, double* ab, std::int64_t lda, std::int64_t ldb, std::int64_t stride, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_cimatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, std::int64_t lda, std::int64_t ldb, - std::int64_t stride, std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size, 
const std::vector& dependencies); sycl::event (*row_major_zimatcopy_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, std::int64_t lda, std::int64_t ldb, - std::int64_t stride, std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, + std::int64_t stride, std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_somatadd_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, float alpha, const float *a, std::int64_t lda, - std::int64_t stride_a, float beta, const float *b, std::int64_t ldb, std::int64_t stride_b, - float *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, float alpha, const float* a, std::int64_t lda, + std::int64_t stride_a, float beta, const float* b, std::int64_t ldb, std::int64_t stride_b, + float* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*row_major_domatadd_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, double alpha, const double *a, std::int64_t lda, - std::int64_t stride_a, double beta, const double *b, std::int64_t ldb, - std::int64_t stride_b, double *c, std::int64_t ldc, std::int64_t stride_c, - std::int64_t batch_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, double alpha, const double* a, std::int64_t lda, + std::int64_t 
stride_a, double beta, const double* b, std::int64_t ldb, + std::int64_t stride_b, double* c, std::int64_t ldc, std::int64_t stride_c, + std::int64_t batch_size, const std::vector& dependencies); sycl::event (*row_major_comatadd_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies); sycl::event (*row_major_zomatadd_batch_strided_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, std::int64_t lda, std::int64_t stride_a, std::complex beta, - const std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::complex *c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, - const std::vector &dependencies); - sycl::event (*row_major_somatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::complex* c, std::int64_t ldc, std::int64_t stride_c, std::int64_t batch_size, + const std::vector& dependencies); + sycl::event (*row_major_somatcopy_usm_sycl)(sycl::queue& queue, 
oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, float *b, + const float* a, std::int64_t lda, float* b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_domatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_domatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, double *b, + const double* a, std::int64_t lda, double* b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_comatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_comatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_zomatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*row_major_zomatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_somatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*row_major_somatcopy2_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - const float *a, std::int64_t lda, - std::int64_t stridea, float *b, std::int64_t ldb, + const float* a, std::int64_t lda, + 
std::int64_t stridea, float* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies); - sycl::event (*row_major_domatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_domatcopy2_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - const double *a, std::int64_t lda, - std::int64_t stridea, double *b, std::int64_t ldb, + const double* a, std::int64_t lda, + std::int64_t stridea, double* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies); - sycl::event (*row_major_comatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_comatcopy2_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies); - sycl::event (*row_major_zomatcopy2_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_zomatcopy2_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, - std::int64_t stridea, std::complex *b, + const std::complex* a, std::int64_t lda, + std::int64_t stridea, std::complex* b, std::int64_t ldb, std::int64_t strideb, - const std::vector &dependencies); - sycl::event (*row_major_simatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_simatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float alpha, - float *ab, std::int64_t lda, std::int64_t ldb, - 
const std::vector &dependencies); - sycl::event (*row_major_dimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + float* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*row_major_dimatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double alpha, - double *ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_cimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + double* ab, std::int64_t lda, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*row_major_cimatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex alpha, std::complex *ab, + std::complex alpha, std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_zimatcopy_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + const std::vector& dependencies); + sycl::event (*row_major_zimatcopy_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::complex alpha, - std::complex *ab, std::int64_t lda, + std::complex* ab, std::int64_t lda, std::int64_t ldb, - const std::vector &dependencies); - sycl::event (*row_major_somatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + const std::vector& dependencies); + sycl::event (*row_major_somatadd_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, float alpha, const float *a, - std::int64_t lda, float beta, const float *b, - std::int64_t ldb, float *c, std::int64_t ldc, - const std::vector &dependencies); - sycl::event (*row_major_domatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + std::int64_t n, float alpha, const float* a, + std::int64_t lda, float beta, const float* b, + std::int64_t ldb, float* c, 
std::int64_t ldc, + const std::vector& dependencies); + sycl::event (*row_major_domatadd_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, - std::int64_t n, double alpha, const double *a, - std::int64_t lda, double beta, const double *b, - std::int64_t ldb, double *c, std::int64_t ldc, - const std::vector &dependencies); + std::int64_t n, double alpha, const double* a, + std::int64_t lda, double beta, const double* b, + std::int64_t ldb, double* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_comatadd_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, - std::int64_t m, std::int64_t n, std::complex alpha, const std::complex *a, - std::int64_t lda, std::complex beta, const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, const std::vector &dependencies); - sycl::event (*row_major_zomatadd_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose transa, + sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, + std::int64_t m, std::int64_t n, std::complex alpha, const std::complex* a, + std::int64_t lda, std::complex beta, const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, const std::vector& dependencies); + sycl::event (*row_major_zomatadd_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, std::int64_t m, std::int64_t n, std::complex alpha, - const std::complex *a, std::int64_t lda, + const std::complex* a, std::int64_t lda, std::complex beta, - const std::complex *b, std::int64_t ldb, - std::complex *c, std::int64_t ldc, - const std::vector &dependencies); + const std::complex* b, std::int64_t ldb, + std::complex* c, std::int64_t ldc, + const std::vector& dependencies); sycl::event (*row_major_somatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - float 
*alpha, const float **a, std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + float* alpha, const float** a, std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); sycl::event (*row_major_domatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, const double **a, std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, const double** a, std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); sycl::event (*row_major_comatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); sycl::event (*row_major_zomatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, const std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* 
trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, const std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); sycl::event (*row_major_simatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - float *alpha, float **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + float* alpha, float** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); sycl::event (*row_major_dimatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - double *alpha, double **ab, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *groupsize, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + double* alpha, double** ab, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* groupsize, const std::vector& dependencies); sycl::event (*row_major_cimatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *m, std::int64_t *n, - std::complex *alpha, std::complex **ab, std::int64_t *lda, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, std::complex** ab, std::int64_t* lda, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); sycl::event (*row_major_zimatcopy_batch_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, 
std::int64_t *m, std::int64_t *n, - std::complex *alpha, std::complex **ab, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *groupsize, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* m, std::int64_t* n, + std::complex* alpha, std::complex** ab, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* groupsize, + const std::vector& dependencies); } blas_function_table_t; diff --git a/src/dft/backends/backend_backward_instantiations.cxx b/src/dft/backends/backend_backward_instantiations.cxx index a6aeaf71b..e4d960afb 100644 --- a/src/dft/backends/backend_backward_instantiations.cxx +++ b/src/dft/backends/backend_backward_instantiations.cxx @@ -25,29 +25,29 @@ using desc_rd_t = dft::detail::descriptor; using desc_cd_t = dft::detail::descriptor; -using depends_vec_t = const std::vector &; +using depends_vec_t = const std::vector&; -#define ONEMKL_DFT_BACKWARD_INSTANTIATIONS(DESCRIPTOR_T, SCALAR_T, FORWARD_T, BACKWARD_T) \ - /* Buffer API */ \ - template ONEMKL_EXPORT void compute_backward(DESCRIPTOR_T &, \ - sycl::buffer &); \ - template ONEMKL_EXPORT void compute_backward( \ - DESCRIPTOR_T &, sycl::buffer &, sycl::buffer &); \ - template ONEMKL_EXPORT void compute_backward( \ - DESCRIPTOR_T &, sycl::buffer &, sycl::buffer &); \ - template ONEMKL_EXPORT void compute_backward( \ - DESCRIPTOR_T &, sycl::buffer &, sycl::buffer &, \ - sycl::buffer &, sycl::buffer &); \ - \ - /* USM API */ \ - template ONEMKL_EXPORT sycl::event compute_backward(DESCRIPTOR_T &, FORWARD_T *, \ - depends_vec_t); \ - template ONEMKL_EXPORT sycl::event compute_backward(DESCRIPTOR_T &, SCALAR_T *, \ - SCALAR_T *, depends_vec_t); \ - template ONEMKL_EXPORT sycl::event compute_backward( \ - DESCRIPTOR_T &, BACKWARD_T *, FORWARD_T *, depends_vec_t); \ - template ONEMKL_EXPORT sycl::event compute_backward( \ - DESCRIPTOR_T &, SCALAR_T *, SCALAR_T *, SCALAR_T *, SCALAR_T *, depends_vec_t); 
+#define ONEMKL_DFT_BACKWARD_INSTANTIATIONS(DESCRIPTOR_T, SCALAR_T, FORWARD_T, BACKWARD_T) \ + /* Buffer API */ \ + template ONEMKL_EXPORT void compute_backward(DESCRIPTOR_T&, \ + sycl::buffer&); \ + template ONEMKL_EXPORT void compute_backward( \ + DESCRIPTOR_T&, sycl::buffer&, sycl::buffer&); \ + template ONEMKL_EXPORT void compute_backward( \ + DESCRIPTOR_T&, sycl::buffer&, sycl::buffer&); \ + template ONEMKL_EXPORT void compute_backward( \ + DESCRIPTOR_T&, sycl::buffer&, sycl::buffer&, sycl::buffer&, \ + sycl::buffer&); \ + \ + /* USM API */ \ + template ONEMKL_EXPORT sycl::event compute_backward(DESCRIPTOR_T&, FORWARD_T*, \ + depends_vec_t); \ + template ONEMKL_EXPORT sycl::event compute_backward(DESCRIPTOR_T&, SCALAR_T*, \ + SCALAR_T*, depends_vec_t); \ + template ONEMKL_EXPORT sycl::event compute_backward(DESCRIPTOR_T&, BACKWARD_T*, \ + FORWARD_T*, depends_vec_t); \ + template ONEMKL_EXPORT sycl::event compute_backward( \ + DESCRIPTOR_T&, SCALAR_T*, SCALAR_T*, SCALAR_T*, SCALAR_T*, depends_vec_t); ONEMKL_DFT_BACKWARD_INSTANTIATIONS(desc_rf_t, float, float, std::complex) ONEMKL_DFT_BACKWARD_INSTANTIATIONS(desc_cf_t, float, std::complex, std::complex) diff --git a/src/dft/backends/backend_forward_instantiations.cxx b/src/dft/backends/backend_forward_instantiations.cxx index a6ed371d5..b23a5ca40 100644 --- a/src/dft/backends/backend_forward_instantiations.cxx +++ b/src/dft/backends/backend_forward_instantiations.cxx @@ -25,29 +25,29 @@ using desc_rd_t = dft::detail::descriptor; using desc_cd_t = dft::detail::descriptor; -using depends_vec_t = const std::vector &; +using depends_vec_t = const std::vector&; -#define ONEMKL_DFT_FORWARD_INSTANTIATIONS(DESCRIPTOR_T, SCALAR_T, FORWARD_T, BACKWARD_T) \ - /* Buffer API */ \ - template ONEMKL_EXPORT void compute_forward(DESCRIPTOR_T &, \ - sycl::buffer &); \ - template ONEMKL_EXPORT void compute_forward( \ - DESCRIPTOR_T &, sycl::buffer &, sycl::buffer &); \ - template ONEMKL_EXPORT void compute_forward( \ - 
DESCRIPTOR_T &, sycl::buffer &, sycl::buffer &); \ - template ONEMKL_EXPORT void compute_forward( \ - DESCRIPTOR_T &, sycl::buffer &, sycl::buffer &, \ - sycl::buffer &, sycl::buffer &); \ - \ - /* USM API */ \ - template ONEMKL_EXPORT sycl::event compute_forward(DESCRIPTOR_T &, FORWARD_T *, \ - depends_vec_t); \ - template ONEMKL_EXPORT sycl::event compute_forward(DESCRIPTOR_T &, SCALAR_T *, \ - SCALAR_T *, depends_vec_t); \ - template ONEMKL_EXPORT sycl::event compute_forward(DESCRIPTOR_T &, FORWARD_T *, \ - BACKWARD_T *, depends_vec_t); \ - template ONEMKL_EXPORT sycl::event compute_forward( \ - DESCRIPTOR_T &, SCALAR_T *, SCALAR_T *, SCALAR_T *, SCALAR_T *, depends_vec_t); +#define ONEMKL_DFT_FORWARD_INSTANTIATIONS(DESCRIPTOR_T, SCALAR_T, FORWARD_T, BACKWARD_T) \ + /* Buffer API */ \ + template ONEMKL_EXPORT void compute_forward(DESCRIPTOR_T&, \ + sycl::buffer&); \ + template ONEMKL_EXPORT void compute_forward( \ + DESCRIPTOR_T&, sycl::buffer&, sycl::buffer&); \ + template ONEMKL_EXPORT void compute_forward( \ + DESCRIPTOR_T&, sycl::buffer&, sycl::buffer&); \ + template ONEMKL_EXPORT void compute_forward( \ + DESCRIPTOR_T&, sycl::buffer&, sycl::buffer&, sycl::buffer&, \ + sycl::buffer&); \ + \ + /* USM API */ \ + template ONEMKL_EXPORT sycl::event compute_forward(DESCRIPTOR_T&, FORWARD_T*, \ + depends_vec_t); \ + template ONEMKL_EXPORT sycl::event compute_forward(DESCRIPTOR_T&, SCALAR_T*, \ + SCALAR_T*, depends_vec_t); \ + template ONEMKL_EXPORT sycl::event compute_forward(DESCRIPTOR_T&, FORWARD_T*, \ + BACKWARD_T*, depends_vec_t); \ + template ONEMKL_EXPORT sycl::event compute_forward( \ + DESCRIPTOR_T&, SCALAR_T*, SCALAR_T*, SCALAR_T*, SCALAR_T*, depends_vec_t); ONEMKL_DFT_FORWARD_INSTANTIATIONS(desc_rf_t, float, float, std::complex) ONEMKL_DFT_FORWARD_INSTANTIATIONS(desc_cf_t, float, std::complex, std::complex) diff --git a/src/dft/backends/cufft/backward.cpp b/src/dft/backends/cufft/backward.cpp index aea9f232f..e4ff1594f 100644 --- 
a/src/dft/backends/cufft/backward.cpp +++ b/src/dft/backends/cufft/backward.cpp @@ -37,19 +37,19 @@ namespace oneapi::mkl::dft::cufft { namespace detail { //forward declaration template -std::array get_offsets_bwd(dft::detail::commit_impl *commit); +std::array get_offsets_bwd(dft::detail::commit_impl* commit); template -cufftHandle get_bwd_plan(dft::detail::commit_impl *commit) { - return static_cast *>(commit->get_handle())[1].value(); +cufftHandle get_bwd_plan(dft::detail::commit_impl* commit) { + return static_cast*>(commit->get_handle())[1].value(); } } // namespace detail // BUFFER version //In-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout) { const std::string func_name = "compute_backward(desc, inout)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -67,35 +67,35 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, } } - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto inout_acc = inout.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, plan); - auto inout_native = reinterpret_cast *>( + auto inout_native = reinterpret_cast*>( ih.get_native_mem(inout_acc)); detail::cufft_execute>( - func_name, stream, plan, reinterpret_cast(inout_native + offsets[0]), - reinterpret_cast(inout_native + offsets[1])); + func_name, stream, plan, reinterpret_cast(inout_native + offsets[0]), + reinterpret_cast(inout_native + offsets[1])); }); }); } //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &, sycl::buffer, 1> &, - sycl::buffer, 1> &) { +ONEMKL_EXPORT void compute_backward(descriptor_type&, sycl::buffer, 1>&, + 
sycl::buffer, 1>&) { throw oneapi::mkl::unimplemented("DFT", "compute_backward(desc, inout_re, inout_im)", "cuFFT does not support real-real complex storage."); } //Out-of-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { const std::string func_name = "compute_backward(desc, in, out)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -112,7 +112,7 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, } } - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto in_acc = in.template get_access(cgh); auto out_acc = out.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh); @@ -120,12 +120,12 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, plan); - auto in_native = reinterpret_cast( - reinterpret_cast *>( + auto in_native = reinterpret_cast( + reinterpret_cast*>( ih.get_native_mem(in_acc)) + offsets[0]); - auto out_native = reinterpret_cast( - reinterpret_cast *>( + auto out_native = reinterpret_cast( + reinterpret_cast*>( ih.get_native_mem(out_acc)) + offsets[1]); detail::cufft_execute>( @@ -136,10 +136,10 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &, sycl::buffer, 1> &, - sycl::buffer, 1> &, - sycl::buffer, 1> &, - sycl::buffer, 1> &) { +ONEMKL_EXPORT void compute_backward(descriptor_type&, sycl::buffer, 1>&, + sycl::buffer, 1>&, + sycl::buffer, 1>&, + sycl::buffer, 1>&) { throw oneapi::mkl::unimplemented("DFT", "compute_backward(desc, in_re, in_im, out_re, out_im)", "cuFFT does not support 
real-real complex storage."); } @@ -148,8 +148,8 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &, sycl::buffer -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies) { const std::string func_name = "compute_backward(desc, inout, dependencies)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -167,7 +167,7 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwddepend_on_last_usm_workspace_event_if_rqd(cgh); @@ -184,9 +184,9 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &, scalar *, - scalar *, - const std::vector &) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type&, scalar*, + scalar*, + const std::vector&) { throw oneapi::mkl::unimplemented("DFT", "compute_backward(desc, inout_re, inout_im, dependencies)", "cuFFT does not support real-real complex storage."); @@ -194,9 +194,9 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &, scalar -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd *in, - fwd *out, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, bwd* in, + fwd* out, + const std::vector& dependencies) { const std::string func_name = "compute_backward(desc, in, out, dependencies)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -213,7 +213,7 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwddepend_on_last_usm_workspace_event_if_rqd(cgh); @@ -230,10 +230,10 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &, scalar *, - scalar *, scalar *, - scalar *, - const std::vector &) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type&, 
scalar*, + scalar*, scalar*, + scalar*, + const std::vector&) { throw oneapi::mkl::unimplemented("DFT", "compute_backward(desc, in_re, in_im, out_re, out_im, deps)", "cuFFT does not support real-real complex storage."); diff --git a/src/dft/backends/cufft/execute_helper.hpp b/src/dft/backends/cufft/execute_helper.hpp index 776f0f254..67e70b005 100644 --- a/src/dft/backends/cufft/execute_helper.hpp +++ b/src/dft/backends/cufft/execute_helper.hpp @@ -37,8 +37,8 @@ namespace oneapi::mkl::dft::cufft::detail { template -inline dft::detail::commit_impl *checked_get_commit( - dft::detail::descriptor &desc) { +inline dft::detail::commit_impl* checked_get_commit( + dft::detail::descriptor& desc) { auto commit_handle = dft::detail::get_commit(desc); if (commit_handle == nullptr || commit_handle->get_backend() != backend::cufft) { throw mkl::invalid_argument("dft/backends/cufft", "get_commit", @@ -50,7 +50,7 @@ inline dft::detail::commit_impl *checked_get_commit( /// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match /// the expected value. 
template -inline auto expect_config(DescT &desc, const char *message) { +inline auto expect_config(DescT& desc, const char* message) { dft::config_value actual{ 0 }; desc.get_value(Param, &actual); if (actual != Expected) { @@ -61,8 +61,8 @@ inline auto expect_config(DescT &desc, const char *message) { enum class Direction { Forward = CUFFT_FORWARD, Backward = CUFFT_INVERSE }; template -void cufft_execute(const std::string &func, CUstream stream, cufftHandle plan, void *input, - void *output) { +void cufft_execute(const std::string& func, CUstream stream, cufftHandle plan, void* input, + void* output) { constexpr bool is_real = std::is_floating_point_v; using single_type = std::conditional_t>; constexpr bool is_single = std::is_same_v; @@ -70,16 +70,16 @@ void cufft_execute(const std::string &func, CUstream stream, cufftHandle plan, v if constexpr (is_real) { if constexpr (dir == Direction::Forward) { if constexpr (is_single) { - auto result = cufftExecR2C(plan, reinterpret_cast(input), - reinterpret_cast(output)); + auto result = cufftExecR2C(plan, reinterpret_cast(input), + reinterpret_cast(output)); if (result != CUFFT_SUCCESS) { throw oneapi::mkl::exception("dft/backends/cufft", func, "cufftExecR2C returned " + std::to_string(result)); } } else { - auto result = cufftExecD2Z(plan, reinterpret_cast(input), - reinterpret_cast(output)); + auto result = cufftExecD2Z(plan, reinterpret_cast(input), + reinterpret_cast(output)); if (result != CUFFT_SUCCESS) { throw oneapi::mkl::exception("dft/backends/cufft", func, "cufftExecD2Z returned " + std::to_string(result)); @@ -88,16 +88,16 @@ void cufft_execute(const std::string &func, CUstream stream, cufftHandle plan, v } else { if constexpr (is_single) { - auto result = cufftExecC2R(plan, reinterpret_cast(input), - reinterpret_cast(output)); + auto result = cufftExecC2R(plan, reinterpret_cast(input), + reinterpret_cast(output)); if (result != CUFFT_SUCCESS) { throw oneapi::mkl::exception("dft/backends/cufft", func, 
"cufftExecC2R returned " + std::to_string(result)); } } else { - auto result = cufftExecZ2D(plan, reinterpret_cast(input), - reinterpret_cast(output)); + auto result = cufftExecZ2D(plan, reinterpret_cast(input), + reinterpret_cast(output)); if (result != CUFFT_SUCCESS) { throw oneapi::mkl::exception("dft/backends/cufft", func, "cufftExecZ2D returned " + std::to_string(result)); @@ -108,8 +108,8 @@ void cufft_execute(const std::string &func, CUstream stream, cufftHandle plan, v else { if constexpr (is_single) { auto result = - cufftExecC2C(plan, reinterpret_cast(input), - reinterpret_cast(output), static_cast(dir)); + cufftExecC2C(plan, reinterpret_cast(input), + reinterpret_cast(output), static_cast(dir)); if (result != CUFFT_SUCCESS) { throw oneapi::mkl::exception("dft/backends/cufft", func, "cufftExecC2C returned " + std::to_string(result)); @@ -117,8 +117,8 @@ void cufft_execute(const std::string &func, CUstream stream, cufftHandle plan, v } else { auto result = - cufftExecZ2Z(plan, reinterpret_cast(input), - reinterpret_cast(output), static_cast(dir)); + cufftExecZ2Z(plan, reinterpret_cast(input), + reinterpret_cast(output), static_cast(dir)); if (result != CUFFT_SUCCESS) { throw oneapi::mkl::exception("dft/backends/cufft", func, "cufftExecZ2Z returned " + std::to_string(result)); @@ -133,7 +133,7 @@ void cufft_execute(const std::string &func, CUstream stream, cufftHandle plan, v } } -inline CUstream setup_stream(const std::string &func, sycl::interop_handle ih, cufftHandle plan) { +inline CUstream setup_stream(const std::string& func, sycl::interop_handle ih, cufftHandle plan) { auto stream = ih.get_native_queue(); auto result = cufftSetStream(plan, stream); if (result != CUFFT_SUCCESS) { diff --git a/src/dft/backends/cufft/forward.cpp b/src/dft/backends/cufft/forward.cpp index fb323c085..8a286c988 100644 --- a/src/dft/backends/cufft/forward.cpp +++ b/src/dft/backends/cufft/forward.cpp @@ -39,11 +39,11 @@ namespace oneapi::mkl::dft::cufft { namespace detail { 
//forward declaration template -std::array get_offsets_fwd(dft::detail::commit_impl *commit); +std::array get_offsets_fwd(dft::detail::commit_impl* commit); template -cufftHandle get_fwd_plan(dft::detail::commit_impl *commit) { - return static_cast *>(commit->get_handle())[0].value(); +cufftHandle get_fwd_plan(dft::detail::commit_impl* commit) { + return static_cast*>(commit->get_handle())[0].value(); } } // namespace detail @@ -51,8 +51,8 @@ cufftHandle get_fwd_plan(dft::detail::commit_impl *commit) { //In-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout) { const std::string func_name = "compute_forward(desc, inout)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -70,34 +70,34 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, offsets[1] *= 2; // offset is supplied in complex but we offset scalar pointer } - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto inout_acc = inout.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, plan); - auto inout_native = reinterpret_cast *>( + auto inout_native = reinterpret_cast*>( ih.get_native_mem(inout_acc)); detail::cufft_execute>( - func_name, stream, plan, reinterpret_cast(inout_native + offsets[0]), - reinterpret_cast(inout_native + offsets[1])); + func_name, stream, plan, reinterpret_cast(inout_native + offsets[0]), + reinterpret_cast(inout_native + offsets[1])); }); }); } //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_forward(descriptor_type &, sycl::buffer, 1> &, - sycl::buffer, 1> &) { +ONEMKL_EXPORT void compute_forward(descriptor_type&, sycl::buffer, 1>&, + sycl::buffer, 1>&) { throw 
oneapi::mkl::unimplemented("DFT", "compute_forward(desc, inout_re, inout_im)", "cuFFT does not support real-real complex storage."); } //Out-of-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { const std::string func_name = "compute_forward(desc, in, out)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -114,7 +114,7 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer(cgh); auto out_acc = out.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh); @@ -122,12 +122,12 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer( - reinterpret_cast *>( + auto in_native = reinterpret_cast( + reinterpret_cast*>( ih.get_native_mem(in_acc)) + offsets[0]); - auto out_native = reinterpret_cast( - reinterpret_cast *>( + auto out_native = reinterpret_cast( + reinterpret_cast*>( ih.get_native_mem(out_acc)) + offsets[1]); detail::cufft_execute>( @@ -138,10 +138,10 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer -ONEMKL_EXPORT void compute_forward(descriptor_type &, sycl::buffer, 1> &, - sycl::buffer, 1> &, - sycl::buffer, 1> &, - sycl::buffer, 1> &) { +ONEMKL_EXPORT void compute_forward(descriptor_type&, sycl::buffer, 1>&, + sycl::buffer, 1>&, + sycl::buffer, 1>&, + sycl::buffer, 1>&) { throw oneapi::mkl::unimplemented("DFT", "compute_forward(desc, in_re, in_im, out_re, out_im)", "cuFFT does not support real-real complex storage."); } @@ -150,8 +150,8 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &, sycl::buffer -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies) { const std::string 
func_name = "compute_forward(desc, inout, dependencies)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -169,7 +169,7 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwddepend_on_last_usm_workspace_event_if_rqd(cgh); @@ -186,9 +186,9 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &, scalar *, - scalar *, - const std::vector &) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type&, scalar*, + scalar*, + const std::vector&) { throw oneapi::mkl::unimplemented("DFT", "compute_forward(desc, inout_re, inout_im, dependencies)", "cuFFT does not support real-real complex storage."); @@ -196,9 +196,9 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &, scalar -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *in, - bwd *out, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* in, + bwd* out, + const std::vector& dependencies) { const std::string func_name = "compute_forward(desc, in, out, dependencies)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -215,7 +215,7 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwddepend_on_last_usm_workspace_event_if_rqd(cgh); @@ -232,10 +232,10 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &, scalar *, - scalar *, scalar *, - scalar *, - const std::vector &) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type&, scalar*, + scalar*, scalar*, + scalar*, + const std::vector&) { throw oneapi::mkl::unimplemented( "DFT", "compute_forward(desc, in_re, in_im, out_re, out_im, dependencies)", "cuFFT does not support real-real complex storage."); diff --git a/src/dft/backends/descriptor.cpp b/src/dft/backends/descriptor.cpp index c6f6884f8..5c3e163ca 100644 --- a/src/dft/backends/descriptor.cpp +++ 
b/src/dft/backends/descriptor.cpp @@ -25,7 +25,7 @@ namespace oneapi::mkl::dft::detail { template -void descriptor::commit(sycl::queue &queue) { +void descriptor::commit(sycl::queue& queue) { if (!pimpl_ || pimpl_->get_queue() != queue) { if (pimpl_) { pimpl_->get_queue().wait(); @@ -34,9 +34,9 @@ void descriptor::commit(sycl::queue &queue) { } pimpl_->commit(values_); } -template void descriptor::commit(sycl::queue &); -template void descriptor::commit(sycl::queue &); -template void descriptor::commit(sycl::queue &); -template void descriptor::commit(sycl::queue &); +template void descriptor::commit(sycl::queue&); +template void descriptor::commit(sycl::queue&); +template void descriptor::commit(sycl::queue&); +template void descriptor::commit(sycl::queue&); } //namespace oneapi::mkl::dft::detail diff --git a/src/dft/backends/mklcpu/backward.cpp b/src/dft/backends/mklcpu/backward.cpp index fe7186630..fe94691bc 100644 --- a/src/dft/backends/mklcpu/backward.cpp +++ b/src/dft/backends/mklcpu/backward.cpp @@ -40,14 +40,14 @@ namespace detail { // BUFFER version // backward a MKLCPU DFT call to the backend, checking that the commit impl is valid. template -inline void check_bwd_commit(dft::descriptor &desc) { +inline void check_bwd_commit(dft::descriptor& desc) { auto commit_handle = dft::detail::get_commit(desc); if (commit_handle == nullptr || commit_handle->get_backend() != backend::mklcpu) { throw mkl::invalid_argument("DFT", "computer_backward", "DFT descriptor has not been commited for MKLCPU"); } - auto mklcpu_desc = reinterpret_cast(commit_handle->get_handle()); + auto mklcpu_desc = reinterpret_cast(commit_handle->get_handle()); MKL_LONG commit_status{ DFTI_UNCOMMITTED }; DftiGetValue(mklcpu_desc[1], DFTI_COMMIT_STATUS, &commit_status); if (commit_status != DFTI_COMMITTED) { @@ -59,7 +59,7 @@ inline void check_bwd_commit(dft::descriptor &desc) { // Throw an mkl::invalid_argument if the runtime param in the descriptor does not match // the expected value. 
template -inline auto expect_config(DescT &desc, const char *message) { +inline auto expect_config(DescT& desc, const char* message) { dft::detail::config_value actual{ 0 }; desc.get_value(Param, &actual); if (actual != Expected) { @@ -68,26 +68,26 @@ inline auto expect_config(DescT &desc, const char *message) { } // convert the base commit class to derived cpu commit class template -auto get_buffer(commit_t *commit_handle) { - commit_derived_t *derived_commit = - static_cast *>(commit_handle); +auto get_buffer(commit_t* commit_handle) { + commit_derived_t* derived_commit = + static_cast*>(commit_handle); return derived_commit->get_handle_buffer(); } } // namespace detail //In-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout) { detail::expect_config( desc, "Unexpected value for placement"); auto commit_handle = dft::detail::get_commit(desc); detail::check_bwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto inout_acc = inout.template get_access(cgh); detail::host_task(cgh, [=]() { @@ -104,20 +104,20 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout_re, - sycl::buffer, 1> &inout_im) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout_re, + sycl::buffer, 1>& inout_im) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); 
detail::check_bwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto re_acc = inout_re.template get_access(cgh); auto im_acc = inout_im.template get_access(cgh); @@ -136,26 +136,26 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //Out-of-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { detail::expect_config(desc, "Unexpected value for placement"); auto commit_handle = dft::detail::get_commit(desc); detail::check_bwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto in_acc = in.template get_access(cgh); auto out_acc = out.template get_access(cgh); detail::host_task(cgh, [=]() { - auto in_ptr = const_cast *>(detail::acc_to_ptr(in_acc)); + auto in_ptr = const_cast*>(detail::acc_to_ptr(in_acc)); DFT_ERROR status = DftiComputeBackward(desc_acc[detail::DIR::bwd], in_ptr, detail::acc_to_ptr(out_acc)); if (status != DFTI_NO_ERROR) { @@ -169,22 +169,22 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in_re, - sycl::buffer, 1> &in_im, - sycl::buffer, 1> &out_re, - sycl::buffer, 1> &out_im) { 
+ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in_re, + sycl::buffer, 1>& in_im, + sycl::buffer, 1>& out_re, + sycl::buffer, 1>& out_im) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); detail::check_bwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto inre_acc = in_re.template get_access(cgh); auto inim_acc = in_im.template get_access(cgh); @@ -192,8 +192,8 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, auto outim_acc = out_im.template get_access(cgh); detail::host_task(cgh, [=]() { - auto inre_ptr = const_cast *>(detail::acc_to_ptr(inre_acc)); - auto inim_ptr = const_cast *>(detail::acc_to_ptr(inim_acc)); + auto inre_ptr = const_cast*>(detail::acc_to_ptr(inre_acc)); + auto inim_ptr = const_cast*>(detail::acc_to_ptr(inim_acc)); DFT_ERROR status = DftiComputeBackward(desc_acc[detail::DIR::bwd], inre_ptr, inim_ptr, detail::acc_to_ptr(outre_acc), detail::acc_to_ptr(outim_acc)); @@ -210,18 +210,18 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //In-place transform template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for placement"); auto commit_handle = dft::detail::get_commit(desc); detail::check_bwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return 
cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); detail::host_task(cgh, [=]() { @@ -237,19 +237,19 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar *inout_re, - scalar *inout_im, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, scalar* inout_re, + scalar* inout_im, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); detail::check_bwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); detail::host_task(cgh, [=]() { @@ -265,9 +265,9 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd *in, - fwd *out, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, bwd* in, + fwd* out, + const std::vector& dependencies) { // Check: inplace, complex storage detail::expect_config(desc, @@ -275,10 +275,10 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwdget_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); @@ -295,20 +295,20 @@ 
ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar *in_re, - scalar *in_im, - scalar *out_re, - scalar *out_im, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, scalar* in_re, + scalar* in_im, + scalar* out_re, + scalar* out_im, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); detail::check_bwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); diff --git a/src/dft/backends/mklcpu/forward.cpp b/src/dft/backends/mklcpu/forward.cpp index 2e5e2fa88..5d90b7854 100644 --- a/src/dft/backends/mklcpu/forward.cpp +++ b/src/dft/backends/mklcpu/forward.cpp @@ -40,14 +40,14 @@ namespace detail { // BUFFER version // Forward a MKLCPU DFT call to the backend, checking that the commit impl is valid. 
template -inline void check_fwd_commit(dft::descriptor &desc) { +inline void check_fwd_commit(dft::descriptor& desc) { auto commit_handle = dft::detail::get_commit(desc); if (commit_handle == nullptr || commit_handle->get_backend() != backend::mklcpu) { throw mkl::invalid_argument("DFT", "computer_forward", "DFT descriptor has not been commited for MKLCPU"); } - auto mklcpu_desc = reinterpret_cast(commit_handle->get_handle()); + auto mklcpu_desc = reinterpret_cast(commit_handle->get_handle()); MKL_LONG commit_status{ DFTI_UNCOMMITTED }; DftiGetValue(mklcpu_desc[0], DFTI_COMMIT_STATUS, &commit_status); if (commit_status != DFTI_COMMITTED) { @@ -59,7 +59,7 @@ inline void check_fwd_commit(dft::descriptor &desc) { // Throw an mkl::invalid_argument if the runtime param in the descriptor does not match // the expected value. template -inline auto expect_config(DescT &desc, const char *message) { +inline auto expect_config(DescT& desc, const char* message) { dft::detail::config_value actual{ 0 }; desc.get_value(Param, &actual); if (actual != Expected) { @@ -69,26 +69,26 @@ inline auto expect_config(DescT &desc, const char *message) { // convert the base commit class to derived cpu commit class template -auto get_buffer(commit_t *commit_handle) { - commit_derived_t *derived_commit = - static_cast *>(commit_handle); +auto get_buffer(commit_t* commit_handle) { + commit_derived_t* derived_commit = + static_cast*>(commit_handle); return derived_commit->get_handle_buffer(); } } // namespace detail //In-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout) { detail::expect_config( desc, "Unexpected value for placement"); auto commit_handle = dft::detail::get_commit(desc); detail::check_fwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto 
mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto inout_acc = inout.template get_access(cgh); detail::host_task(cgh, [=]() { @@ -105,20 +105,20 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout_re, - sycl::buffer, 1> &inout_im) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout_re, + sycl::buffer, 1>& inout_im) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); detail::check_fwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto re_acc = inout_re.template get_access(cgh); auto im_acc = inout_im.template get_access(cgh); @@ -137,25 +137,25 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //Out-of-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { detail::expect_config(desc, "Unexpected value for placement"); auto commit_handle = dft::detail::get_commit(desc); detail::check_fwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + 
cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto in_acc = in.template get_access(cgh); auto out_acc = out.template get_access(cgh); detail::host_task(cgh, [=]() { - auto in_ptr = const_cast *>(detail::acc_to_ptr(in_acc)); + auto in_ptr = const_cast*>(detail::acc_to_ptr(in_acc)); DFT_ERROR status = DftiComputeForward(desc_acc[detail::DIR::fwd], in_ptr, detail::acc_to_ptr(out_acc)); if (status != DFTI_NO_ERROR) { @@ -169,22 +169,22 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &in_re, - sycl::buffer, 1> &in_im, - sycl::buffer, 1> &out_re, - sycl::buffer, 1> &out_im) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& in_re, + sycl::buffer, 1>& in_im, + sycl::buffer, 1>& out_re, + sycl::buffer, 1>& out_im) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); detail::check_fwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - cpu_queue.submit([&](sycl::handler &cgh) { + cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); auto inre_acc = in_re.template get_access(cgh); auto inim_acc = in_im.template get_access(cgh); @@ -192,8 +192,8 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, auto outim_acc = out_im.template get_access(cgh); detail::host_task(cgh, [=]() { - auto inre_ptr = const_cast *>(detail::acc_to_ptr(inre_acc)); - auto inim_ptr = const_cast *>(detail::acc_to_ptr(inim_acc)); + auto inre_ptr = const_cast*>(detail::acc_to_ptr(inre_acc)); + auto inim_ptr = const_cast*>(detail::acc_to_ptr(inim_acc)); DFT_ERROR status = DftiComputeForward(desc_acc[detail::DIR::fwd], inre_ptr, inim_ptr, 
detail::acc_to_ptr(outre_acc), detail::acc_to_ptr(outim_acc)); @@ -210,18 +210,18 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //In-place transform template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for placement"); auto commit_handle = dft::detail::get_commit(desc); detail::check_fwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); @@ -238,20 +238,20 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar *inout_re, - scalar *inout_im, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, scalar* inout_re, + scalar* inout_im, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); detail::check_fwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); @@ -268,9 +268,9 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *in, - bwd *out, - 
const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* in, + bwd* out, + const std::vector& dependencies) { // Check: inplace detail::expect_config(desc, @@ -278,11 +278,11 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwdget_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); @@ -299,22 +299,22 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar *in_re, - scalar *in_im, - scalar *out_re, - scalar *out_im, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, scalar* in_re, + scalar* in_im, + scalar* out_re, + scalar* out_im, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for complex storage"); auto commit_handle = dft::detail::get_commit(desc); detail::check_fwd_commit(desc); - sycl::queue &cpu_queue{ commit_handle->get_queue() }; + sycl::queue& cpu_queue{ commit_handle->get_queue() }; auto mklcpu_desc_buffer{ detail::get_buffer(commit_handle) }; - return cpu_queue.submit([&](sycl::handler &cgh) { + return cpu_queue.submit([&](sycl::handler& cgh) { auto desc_acc = mklcpu_desc_buffer.template get_access(cgh); cgh.depends_on(dependencies); diff --git a/src/dft/backends/mklgpu/backward.cpp b/src/dft/backends/mklgpu/backward.cpp index 6c4896c66..5871d9d99 100644 --- a/src/dft/backends/mklgpu/backward.cpp +++ b/src/dft/backends/mklgpu/backward.cpp @@ -39,7 +39,7 @@ namespace detail { /// Forward a MKLGPU DFT call to the backend, checking that the commit impl is valid. /// Assumes backend descriptor values match those of the frontend. 
template -inline auto compute_backward(dft::detail::descriptor &desc, ArgTs &&... args) { +inline auto compute_backward(dft::detail::descriptor& desc, ArgTs&&... args) { using mklgpu_desc_t = dft::descriptor; using desc_shptr_t = std::shared_ptr; using handle_t = std::pair; @@ -48,7 +48,7 @@ inline auto compute_backward(dft::detail::descriptor &desc, ArgTs &&. throw mkl::invalid_argument("DFT", "compute_backward", "DFT descriptor has not been commited for MKLGPU"); } - auto handle = reinterpret_cast(commit_handle->get_handle()); + auto handle = reinterpret_cast(commit_handle->get_handle()); auto mklgpu_desc = handle->second; // Second because backward DFT. int commit_status{ DFTI_UNCOMMITTED }; mklgpu_desc->get_value(dft::config_param::COMMIT_STATUS, &commit_status); @@ -65,7 +65,7 @@ inline auto compute_backward(dft::detail::descriptor &desc, ArgTs &&. /// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match /// the expected value. template -inline auto expect_config(DescT &desc, const char *message) { +inline auto expect_config(DescT& desc, const char* message) { dft::detail::config_value actual{ 0 }; desc.get_value(Param, &actual); if (actual != Expected) { @@ -78,8 +78,8 @@ inline auto expect_config(DescT &desc, const char *message) { //In-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout) { detail::expect_config( desc, "Unexpected value for placement"); return detail::compute_backward(desc, inout); @@ -87,18 +87,18 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type & /*desc*/, - sycl::buffer, 1> & /*inout_re*/, - sycl::buffer, 1> & /*inout_im*/) { +ONEMKL_EXPORT void compute_backward(descriptor_type& /*desc*/, + 
sycl::buffer, 1>& /*inout_re*/, + sycl::buffer, 1>& /*inout_im*/) { throw mkl::unimplemented("DFT", "compute_backward", "MKLGPU does not support compute_backward(desc, inout_re, inout_im)."); } //Out-of-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { detail::expect_config(desc, "Unexpected value for placement"); @@ -107,11 +107,11 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> & /*in_re*/, - sycl::buffer, 1> & /*in_im*/, - sycl::buffer, 1> & /*out_re*/, - sycl::buffer, 1> & /*out_im*/) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& /*in_re*/, + sycl::buffer, 1>& /*in_im*/, + sycl::buffer, 1>& /*out_re*/, + sycl::buffer, 1>& /*out_im*/) { detail::expect_config( desc, "Unexpected value for complex storage"); @@ -124,8 +124,8 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //In-place transform template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for placement"); return detail::compute_backward(desc, inout, dependencies); @@ -133,10 +133,10 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type & /*desc*/, - scalar * /*inout_re*/, - scalar * /*inout_im*/, - const std::vector & /*dependencies*/) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& /*desc*/, + scalar* /*inout_re*/, + scalar* /*inout_im*/, + const 
std::vector& /*dependencies*/) { throw mkl::unimplemented( "DFT", "compute_backward", "MKLGPU does not support compute_backward(desc, inout_re, inout_im, dependencies)."); @@ -144,9 +144,9 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type & /*desc*/, //Out-of-place transform template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd *in, - fwd *out, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, bwd* in, + fwd* out, + const std::vector& dependencies) { detail::expect_config(desc, "Unexpected value for placement"); @@ -155,12 +155,12 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, - scalar * /*in_re*/, - scalar * /*in_im*/, - scalar * /*out_re*/, - scalar * /*out_im*/, - const std::vector & /*dependencies*/) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, + scalar* /*in_re*/, + scalar* /*in_im*/, + scalar* /*out_re*/, + scalar* /*out_im*/, + const std::vector& /*dependencies*/) { detail::expect_config( desc, "Unexpected value for complex storage"); diff --git a/src/dft/backends/mklgpu/forward.cpp b/src/dft/backends/mklgpu/forward.cpp index 39da42e45..91039889f 100644 --- a/src/dft/backends/mklgpu/forward.cpp +++ b/src/dft/backends/mklgpu/forward.cpp @@ -46,7 +46,7 @@ namespace detail { /// Forward a MKLGPU DFT call to the backend, checking that the commit impl is valid. /// Assumes backend descriptor values match those of the frontend. template -inline auto compute_forward(dft::detail::descriptor &desc, ArgTs &&... args) { +inline auto compute_forward(dft::detail::descriptor& desc, ArgTs&&... args) { using mklgpu_desc_t = dft::descriptor; using desc_shptr_t = std::shared_ptr; using handle_t = std::pair; @@ -55,7 +55,7 @@ inline auto compute_forward(dft::detail::descriptor &desc, ArgTs &&.. 
throw mkl::invalid_argument("DFT", "compute_forward", "DFT descriptor has not been commited for MKLGPU"); } - auto handle = reinterpret_cast(commit_handle->get_handle()); + auto handle = reinterpret_cast(commit_handle->get_handle()); auto mklgpu_desc = handle->first; // First because forward DFT. int commit_status{ DFTI_UNCOMMITTED }; mklgpu_desc->get_value(dft::config_param::COMMIT_STATUS, &commit_status); @@ -72,7 +72,7 @@ inline auto compute_forward(dft::detail::descriptor &desc, ArgTs &&.. /// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match /// the expected value. template -inline auto expect_config(DescT &desc, const char *message) { +inline auto expect_config(DescT& desc, const char* message) { dft::detail::config_value actual{ 0 }; desc.get_value(Param, &actual); if (actual != Expected) { @@ -85,8 +85,8 @@ inline auto expect_config(DescT &desc, const char *message) { //In-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout) { detail::expect_config( desc, "Unexpected value for placement"); return detail::compute_forward(desc, inout); @@ -94,17 +94,17 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_forward(descriptor_type & /*desc*/, - sycl::buffer, 1> & /*inout_re*/, - sycl::buffer, 1> & /*inout_im*/) { +ONEMKL_EXPORT void compute_forward(descriptor_type& /*desc*/, + sycl::buffer, 1>& /*inout_re*/, + sycl::buffer, 1>& /*inout_im*/) { throw mkl::unimplemented("DFT", "compute_forward", "MKLGPU does not support compute_forward(desc, inout_re, inout_im)."); } //Out-of-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { +ONEMKL_EXPORT void 
compute_forward(descriptor_type& desc, sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { detail::expect_config(desc, "Unexpected value for placement"); @@ -113,11 +113,11 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> & /*in_re*/, - sycl::buffer, 1> & /*in_im*/, - sycl::buffer, 1> & /*out_re*/, - sycl::buffer, 1> & /*out_im*/) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& /*in_re*/, + sycl::buffer, 1>& /*in_im*/, + sycl::buffer, 1>& /*out_re*/, + sycl::buffer, 1>& /*out_im*/) { detail::expect_config( desc, "Unexpected value for complex storage"); @@ -130,8 +130,8 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //In-place transform template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *inout, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* inout, + const std::vector& dependencies) { detail::expect_config( desc, "Unexpected value for placement"); return detail::compute_forward(desc, inout, dependencies); @@ -139,10 +139,10 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type & /*desc*/, - scalar * /*inout_re*/, - scalar * /*inout_im*/, - const std::vector & /*dependencies*/) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& /*desc*/, + scalar* /*inout_re*/, + scalar* /*inout_im*/, + const std::vector& /*dependencies*/) { throw mkl::unimplemented( "DFT", "compute_forward", "MKLGPU does not support compute_forward(desc, inout_re, inout_im, dependencies)."); @@ -150,9 +150,9 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type & /*desc*/, //Out-of-place transform template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *in, - bwd *out, - const std::vector &dependencies) { +ONEMKL_EXPORT sycl::event 
compute_forward(descriptor_type& desc, fwd* in, + bwd* out, + const std::vector& dependencies) { detail::expect_config(desc, "Unexpected value for placement"); @@ -161,12 +161,11 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, - scalar * /*in_re*/, - scalar * /*in_im*/, - scalar * /*out_re*/, - scalar * /*out_im*/, - const std::vector & /*dependencies*/) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, scalar* /*in_re*/, + scalar* /*in_im*/, + scalar* /*out_re*/, + scalar* /*out_im*/, + const std::vector& /*dependencies*/) { detail::expect_config( desc, "Unexpected value for complex storage"); diff --git a/src/dft/backends/portfft/portfft_helper.hpp b/src/dft/backends/portfft/portfft_helper.hpp index 373865f49..010f2a5e6 100644 --- a/src/dft/backends/portfft/portfft_helper.hpp +++ b/src/dft/backends/portfft/portfft_helper.hpp @@ -31,8 +31,8 @@ namespace pfft = portfft; namespace oneapi::mkl::dft::portfft::detail { template -inline dft::detail::commit_impl *checked_get_commit( - dft::detail::descriptor &desc) { +inline dft::detail::commit_impl* checked_get_commit( + dft::detail::descriptor& desc) { auto commit_handle = dft::detail::get_commit(desc); if (commit_handle == nullptr || commit_handle->get_backend() != backend::portfft) { throw mkl::invalid_argument("dft/backends/portfft", "get_commit", @@ -53,9 +53,9 @@ using storage_type = detail::to_pfft_domain::type::value>>; template -auto get_descriptors(descriptor_type &desc) { +auto get_descriptors(descriptor_type& desc) { auto commit = detail::checked_get_commit(desc); - return reinterpret_cast *>(commit->get_handle()); + return reinterpret_cast*>(commit->get_handle()); } } // namespace oneapi::mkl::dft::portfft::detail diff --git a/src/dft/backends/rocfft/backward.cpp b/src/dft/backends/rocfft/backward.cpp index 5ff0e2a1f..17427f688 100644 --- a/src/dft/backends/rocfft/backward.cpp +++ 
b/src/dft/backends/rocfft/backward.cpp @@ -38,24 +38,24 @@ namespace oneapi::mkl::dft::rocfft { namespace detail { //forward declaration template -std::array get_offsets_bwd(dft::detail::commit_impl *commit); +std::array get_offsets_bwd(dft::detail::commit_impl* commit); template -rocfft_plan get_bwd_plan(dft::detail::commit_impl *commit) { - return static_cast(commit->get_handle())[1].plan.value(); +rocfft_plan get_bwd_plan(dft::detail::commit_impl* commit) { + return static_cast(commit->get_handle())[1].plan.value(); } template -rocfft_execution_info get_bwd_info(dft::detail::commit_impl *commit) { - return static_cast(commit->get_handle())[1].info.value(); +rocfft_execution_info get_bwd_info(dft::detail::commit_impl* commit) { + return static_cast(commit->get_handle())[1].info.value(); } } // namespace detail // BUFFER version //In-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout) { const std::string func_name = "compute_backward(desc, inout)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -74,15 +74,15 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!"); } - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto inout_acc = inout.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - auto inout_native = reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, inout_acc)) + + auto inout_native = reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, inout_acc)) + offsets[0]); detail::execute_checked(func_name, plan, &inout_native, nullptr, info); detail::sync_checked(func_name, stream); @@ 
-92,9 +92,9 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &inout_re, - sycl::buffer, 1> &inout_im) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& inout_re, + sycl::buffer, 1>& inout_im) { const std::string func_name = "compute_backward(desc, inout_re, inout_im)"; auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); @@ -108,7 +108,7 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!"); } - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto inout_re_acc = inout_re.template get_access(cgh); auto inout_im_acc = inout_im.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh); @@ -116,13 +116,13 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - std::array inout_native{ - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, inout_re_acc)) + - offsets[0]), - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, inout_im_acc)) + - offsets[0]) + std::array inout_native{ + reinterpret_cast(reinterpret_cast*>( + detail::native_mem(ih, inout_re_acc)) + + offsets[0]), + reinterpret_cast(reinterpret_cast*>( + detail::native_mem(ih, inout_im_acc)) + + offsets[0]) }; detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info); detail::sync_checked(func_name, stream); @@ -132,9 +132,9 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //Out-of-place transform template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { 
+ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { detail::expect_config( desc, "Unexpected value for placement"); auto commit = detail::checked_get_commit(desc); @@ -143,7 +143,7 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, auto info = detail::get_bwd_info(commit); auto offsets = detail::get_offsets_bwd(commit); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto in_acc = in.template get_access(cgh); auto out_acc = out.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_backward", cgh); @@ -152,11 +152,11 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, const std::string func_name = "compute_backward(desc, in, out)"; auto stream = detail::setup_stream(func_name, ih, info); - auto in_native = reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, in_acc)) + + auto in_native = reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, in_acc)) + offsets[0]); - auto out_native = reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, out_acc)) + + auto out_native = reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, out_acc)) + offsets[1]); detail::execute_checked(func_name, plan, &in_native, &out_native, info); detail::sync_checked(func_name, stream); @@ -166,18 +166,18 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //Out-of-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_backward(descriptor_type &desc, - sycl::buffer, 1> &in_re, - sycl::buffer, 1> &in_im, - sycl::buffer, 1> &out_re, - sycl::buffer, 1> &out_im) { +ONEMKL_EXPORT void compute_backward(descriptor_type& desc, + sycl::buffer, 1>& in_re, + sycl::buffer, 1>& in_im, + sycl::buffer, 1>& out_re, + sycl::buffer, 1>& out_im) { auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); auto plan = 
detail::get_bwd_plan(commit); auto info = detail::get_bwd_info(commit); auto offsets = detail::get_offsets_bwd(commit); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto in_re_acc = in_re.template get_access(cgh); auto in_im_acc = in_im.template get_access(cgh); auto out_re_acc = out_re.template get_access(cgh); @@ -188,21 +188,21 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, const std::string func_name = "compute_backward(desc, in_re, in_im, out_re, out_im)"; auto stream = detail::setup_stream(func_name, ih, info); - std::array in_native{ - reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, in_re_acc)) + + std::array in_native{ + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, in_re_acc)) + offsets[0]), - reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, in_im_acc)) + + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, in_im_acc)) + offsets[0]) }; - std::array out_native{ - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, out_re_acc)) + - offsets[1]), - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, out_im_acc)) + - offsets[1]) + std::array out_native{ + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, out_re_acc)) + + offsets[1]), + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, out_im_acc)) + + offsets[1]) }; detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info); detail::sync_checked(func_name, stream); @@ -214,8 +214,8 @@ ONEMKL_EXPORT void compute_backward(descriptor_type &desc, //In-place transform template -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd *inout, - const std::vector &deps) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, fwd* inout, + const std::vector& deps) { const std::string func_name = "compute_backward(desc, inout, deps)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -235,14 +235,14 @@ 
ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwddepend_on_last_usm_workspace_event_if_rqd(cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - void *inout_ptr = inout; + void* inout_ptr = inout; detail::execute_checked(func_name, plan, &inout_ptr, nullptr, info); detail::sync_checked(func_name, stream); }); @@ -253,9 +253,9 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar *inout_re, - scalar *inout_im, - const std::vector &deps) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, scalar* inout_re, + scalar* inout_im, + const std::vector& deps) { const std::string func_name = "compute_backward(desc, inout_re, inout_im, deps)"; auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); @@ -269,14 +269,14 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalardepend_on_last_usm_workspace_event_if_rqd(cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - std::array inout_native{ inout_re + offsets[0], inout_im + offsets[0] }; + std::array inout_native{ inout_re + offsets[0], inout_im + offsets[0] }; detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info); detail::sync_checked(func_name, stream); }); @@ -287,9 +287,9 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd *in, - fwd *out, - const std::vector &deps) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, bwd* in, + fwd* out, + const std::vector& deps) { detail::expect_config( desc, "Unexpected value for placement"); auto commit = detail::checked_get_commit(desc); @@ -301,7 +301,7 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, 
bwddepend_on_last_usm_workspace_event_if_rqd(cgh); @@ -309,8 +309,8 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, bwd -ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar *in_re, - scalar *in_im, - scalar *out_re, - scalar *out_im, - const std::vector &deps) { +ONEMKL_EXPORT sycl::event compute_backward(descriptor_type& desc, scalar* in_re, + scalar* in_im, + scalar* out_re, + scalar* out_im, + const std::vector& deps) { auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); auto plan = detail::get_bwd_plan(commit); auto info = detail::get_bwd_info(commit); auto offsets = detail::get_offsets_bwd(commit); - sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) { + sycl::event sycl_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); commit->depend_on_last_usm_workspace_event_if_rqd(cgh); @@ -341,8 +341,8 @@ ONEMKL_EXPORT sycl::event compute_backward(descriptor_type &desc, scalar in_native{ in_re + offsets[0], in_im + offsets[0] }; - std::array out_native{ out_re + offsets[1], out_im + offsets[1] }; + std::array in_native{ in_re + offsets[0], in_im + offsets[0] }; + std::array out_native{ out_re + offsets[1], out_im + offsets[1] }; detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info); detail::sync_checked(func_name, stream); }); diff --git a/src/dft/backends/rocfft/execute_helper.hpp b/src/dft/backends/rocfft/execute_helper.hpp index 4dff6831d..1a03d523a 100644 --- a/src/dft/backends/rocfft/execute_helper.hpp +++ b/src/dft/backends/rocfft/execute_helper.hpp @@ -37,8 +37,8 @@ namespace oneapi::mkl::dft::rocfft::detail { template -inline dft::detail::commit_impl *checked_get_commit( - dft::detail::descriptor &desc) { +inline dft::detail::commit_impl* checked_get_commit( + dft::detail::descriptor& desc) { auto commit_handle = dft::detail::get_commit(desc); if (commit_handle == nullptr || commit_handle->get_backend() != backend::rocfft) { throw 
mkl::invalid_argument("dft/backends/rocfft", "get_commit", @@ -50,7 +50,7 @@ inline dft::detail::commit_impl *checked_get_commit( /// Throw an mkl::invalid_argument if the runtime param in the descriptor does not match /// the expected value. template -inline auto expect_config(DescT &desc, const char *message) { +inline auto expect_config(DescT& desc, const char* message) { dft::config_value actual{ 0 }; desc.get_value(Param, &actual); if (actual != Expected) { @@ -59,11 +59,11 @@ inline auto expect_config(DescT &desc, const char *message) { } template -inline void *native_mem(sycl::interop_handle &ih, Acc &buf) { +inline void* native_mem(sycl::interop_handle& ih, Acc& buf) { return ih.get_native_mem(buf); } -inline hipStream_t setup_stream(const std::string &func, sycl::interop_handle &ih, +inline hipStream_t setup_stream(const std::string& func, sycl::interop_handle& ih, rocfft_execution_info info) { auto stream = ih.get_native_queue(); auto result = rocfft_execution_info_set_stream(info, stream); @@ -75,7 +75,7 @@ inline hipStream_t setup_stream(const std::string &func, sycl::interop_handle &i return stream; } -inline void sync_checked(const std::string &func, hipStream_t stream) { +inline void sync_checked(const std::string& func, hipStream_t stream) { auto result = hipStreamSynchronize(stream); if (result != hipSuccess) { throw oneapi::mkl::exception("dft/backends/rocfft", func, @@ -83,8 +83,8 @@ inline void sync_checked(const std::string &func, hipStream_t stream) { } } -inline void execute_checked(const std::string &func, const rocfft_plan plan, void *in_buffer[], - void *out_buffer[], rocfft_execution_info info) { +inline void execute_checked(const std::string& func, const rocfft_plan plan, void* in_buffer[], + void* out_buffer[], rocfft_execution_info info) { auto result = rocfft_execute(plan, in_buffer, out_buffer, info); if (result != rocfft_status_success) { throw oneapi::mkl::exception("dft/backends/rocfft", func, diff --git 
a/src/dft/backends/rocfft/forward.cpp b/src/dft/backends/rocfft/forward.cpp index 70d3d0f97..100a5abec 100644 --- a/src/dft/backends/rocfft/forward.cpp +++ b/src/dft/backends/rocfft/forward.cpp @@ -40,16 +40,16 @@ namespace oneapi::mkl::dft::rocfft { namespace detail { //forward declaration template -std::array get_offsets_fwd(dft::detail::commit_impl *commit); +std::array get_offsets_fwd(dft::detail::commit_impl* commit); template -rocfft_plan get_fwd_plan(dft::detail::commit_impl *commit) { - return static_cast(commit->get_handle())[0].plan.value(); +rocfft_plan get_fwd_plan(dft::detail::commit_impl* commit) { + return static_cast(commit->get_handle())[0].plan.value(); } template -rocfft_execution_info get_fwd_info(dft::detail::commit_impl *commit) { - return static_cast(commit->get_handle())[0].info.value(); +rocfft_execution_info get_fwd_info(dft::detail::commit_impl* commit) { + return static_cast(commit->get_handle())[0].info.value(); } } // namespace detail @@ -57,8 +57,8 @@ rocfft_execution_info get_fwd_info(dft::detail::commit_impl *commit) //In-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout) { const std::string func_name = "compute_forward(desc, inout)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -77,15 +77,15 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!"); } - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto inout_acc = inout.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - auto inout_native = reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, inout_acc)) + + auto 
inout_native = reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, inout_acc)) + offsets[0]); detail::execute_checked(func_name, plan, &inout_native, nullptr, info); detail::sync_checked(func_name, stream); @@ -95,9 +95,9 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //In-place transform, using config_param::COMPLEX_STORAGE=config_value::REAL_REAL data format template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &inout_re, - sycl::buffer, 1> &inout_im) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& inout_re, + sycl::buffer, 1>& inout_im) { const std::string func_name = "compute_forward(desc, inout_re, inout_im)"; auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); @@ -111,7 +111,7 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, "rocFFT requires input and output offsets (first value in strides) to be equal for in-place transforms!"); } - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto inout_re_acc = inout_re.template get_access(cgh); auto inout_im_acc = inout_im.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh); @@ -119,13 +119,13 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - std::array inout_native{ - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, inout_re_acc)) + - offsets[0]), - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, inout_im_acc)) + - offsets[0]) + std::array inout_native{ + reinterpret_cast(reinterpret_cast*>( + detail::native_mem(ih, inout_re_acc)) + + offsets[0]), + reinterpret_cast(reinterpret_cast*>( + detail::native_mem(ih, inout_im_acc)) + + offsets[0]) }; detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info); detail::sync_checked(func_name, stream); @@ -135,8 
+135,8 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //Out-of-place transform template -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer, 1> &in, - sycl::buffer, 1> &out) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, sycl::buffer, 1>& in, + sycl::buffer, 1>& out) { detail::expect_config( desc, "Unexpected value for placement"); auto commit = detail::checked_get_commit(desc); @@ -145,7 +145,7 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer(cgh); auto out_acc = out.template get_access(cgh); commit->add_buffer_workspace_dependency_if_rqd("compute_forward", cgh); @@ -154,11 +154,11 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer( - reinterpret_cast *>(detail::native_mem(ih, in_acc)) + + auto in_native = reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, in_acc)) + offsets[0]); - auto out_native = reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, out_acc)) + + auto out_native = reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, out_acc)) + offsets[1]); detail::execute_checked(func_name, plan, &in_native, &out_native, info); detail::sync_checked(func_name, stream); @@ -168,18 +168,18 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, sycl::buffer -ONEMKL_EXPORT void compute_forward(descriptor_type &desc, - sycl::buffer, 1> &in_re, - sycl::buffer, 1> &in_im, - sycl::buffer, 1> &out_re, - sycl::buffer, 1> &out_im) { +ONEMKL_EXPORT void compute_forward(descriptor_type& desc, + sycl::buffer, 1>& in_re, + sycl::buffer, 1>& in_im, + sycl::buffer, 1>& out_re, + sycl::buffer, 1>& out_im) { auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); auto plan = detail::get_fwd_plan(commit); auto info = detail::get_fwd_info(commit); auto offsets = detail::get_offsets_fwd(commit); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto in_re_acc = in_re.template get_access(cgh); auto 
in_im_acc = in_im.template get_access(cgh); auto out_re_acc = out_re.template get_access(cgh); @@ -190,21 +190,21 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, const std::string func_name = "compute_forward(desc, in_re, in_im, out_re, out_im)"; auto stream = detail::setup_stream(func_name, ih, info); - std::array in_native{ - reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, in_re_acc)) + + std::array in_native{ + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, in_re_acc)) + offsets[0]), - reinterpret_cast( - reinterpret_cast *>(detail::native_mem(ih, in_im_acc)) + + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, in_im_acc)) + offsets[0]) }; - std::array out_native{ - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, out_re_acc)) + - offsets[1]), - reinterpret_cast(reinterpret_cast *>( - detail::native_mem(ih, out_im_acc)) + - offsets[1]) + std::array out_native{ + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, out_re_acc)) + + offsets[1]), + reinterpret_cast( + reinterpret_cast*>(detail::native_mem(ih, out_im_acc)) + + offsets[1]) }; detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info); detail::sync_checked(func_name, stream); @@ -216,8 +216,8 @@ ONEMKL_EXPORT void compute_forward(descriptor_type &desc, //In-place transform template -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *inout, - const std::vector &deps) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* inout, + const std::vector& deps) { const std::string func_name = "compute_forward(desc, inout, deps)"; detail::expect_config( desc, "Unexpected value for placement"); @@ -237,14 +237,14 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwddepend_on_last_usm_workspace_event_if_rqd(cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - void *inout_ptr = inout; + void* 
inout_ptr = inout; detail::execute_checked(func_name, plan, &inout_ptr, nullptr, info); detail::sync_checked(func_name, stream); }); @@ -255,9 +255,9 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar *inout_re, - scalar *inout_im, - const std::vector &deps) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, scalar* inout_re, + scalar* inout_im, + const std::vector& deps) { const std::string func_name = "compute_forward(desc, inout_re, inout_im, deps)"; auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); @@ -271,13 +271,13 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalardepend_on_last_usm_workspace_event_if_rqd(cgh); cgh.host_task([=](sycl::interop_handle ih) { auto stream = detail::setup_stream(func_name, ih, info); - std::array inout_native{ inout_re + offsets[0], inout_im + offsets[0] }; + std::array inout_native{ inout_re + offsets[0], inout_im + offsets[0] }; detail::execute_checked(func_name, plan, inout_native.data(), nullptr, info); detail::sync_checked(func_name, stream); }); @@ -288,9 +288,9 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd *in, - bwd *out, - const std::vector &deps) { +ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, fwd* in, + bwd* out, + const std::vector& deps) { detail::expect_config( desc, "Unexpected value for placement"); auto commit = detail::checked_get_commit(desc); @@ -302,7 +302,7 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwddepend_on_last_usm_workspace_event_if_rqd(cgh); @@ -310,8 +310,8 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, fwd -ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar *in_re, - scalar *in_im, - scalar *out_re, - scalar *out_im, - const std::vector &deps) { 
+ONEMKL_EXPORT sycl::event compute_forward(descriptor_type& desc, scalar* in_re, + scalar* in_im, + scalar* out_re, + scalar* out_im, + const std::vector& deps) { auto commit = detail::checked_get_commit(desc); auto queue = commit->get_queue(); auto plan = detail::get_fwd_plan(commit); auto info = detail::get_fwd_info(commit); auto offsets = detail::get_offsets_fwd(commit); - sycl::event sycl_event = queue.submit([&](sycl::handler &cgh) { + sycl::event sycl_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); commit->depend_on_last_usm_workspace_event_if_rqd(cgh); @@ -342,8 +342,8 @@ ONEMKL_EXPORT sycl::event compute_forward(descriptor_type &desc, scalar in_native{ in_re + offsets[0], in_im + offsets[0] }; - std::array out_native{ out_re + offsets[1], out_im + offsets[1] }; + std::array in_native{ in_re + offsets[0], in_im + offsets[0] }; + std::array out_native{ out_re + offsets[1], out_im + offsets[1] }; detail::execute_checked(func_name, plan, in_native.data(), out_native.data(), info); detail::sync_checked(func_name, stream); }); diff --git a/src/include/allocator_helper.hpp b/src/include/allocator_helper.hpp index 8ea802dd1..2678dc114 100644 --- a/src/include/allocator_helper.hpp +++ b/src/include/allocator_helper.hpp @@ -29,7 +29,7 @@ namespace oneapi { namespace mkl { -static inline void *aligned_alloc(size_t align, size_t size) { +static inline void* aligned_alloc(size_t align, size_t size) { #ifdef _WIN64 return ::_aligned_malloc(size, align); #else @@ -37,7 +37,7 @@ static inline void *aligned_alloc(size_t align, size_t size) { #endif } -static inline void aligned_free(void *p) { +static inline void aligned_free(void* p) { #ifdef _WIN64 ::_aligned_free(p); #else diff --git a/src/include/function_table_initializer.hpp b/src/include/function_table_initializer.hpp index 24b2ffb86..5f9d163cd 100644 --- a/src/include/function_table_initializer.hpp +++ b/src/include/function_table_initializer.hpp @@ -30,7 +30,7 @@ #ifdef __linux__ #include 
-#define LIB_TYPE void * +#define LIB_TYPE void* #define GET_LIB_HANDLE(libname) dlopen((libname), RTLD_LAZY | RTLD_GLOBAL) #define GET_FUNC(lib, fn) dlsym(lib, (fn)) #define FREE_LIB_HANDLE(libname) dlclose(libname) @@ -59,7 +59,7 @@ class table_initializer { using dlhandle = std::unique_ptr; public: - function_table_t &operator[](oneapi::mkl::device key) { + function_table_t& operator[](oneapi::mkl::device key) { auto lib = tables.find(key); if (lib != tables.end()) return lib->second; @@ -90,10 +90,10 @@ class table_initializer { } #endif - function_table_t &add_table(oneapi::mkl::device key) { + function_table_t& add_table(oneapi::mkl::device key) { dlhandle handle; // check all available libraries for the key(device) - for (const char *libname : libraries[domain_id][key]) { + for (const char* libname : libraries[domain_id][key]) { handle = dlhandle{ ::GET_LIB_HANDLE(libname) }; if (handle) break; @@ -103,7 +103,7 @@ class table_initializer { throw mkl::backend_not_found(); } auto t = - reinterpret_cast(::GET_FUNC(handle.get(), table_names[domain_id])); + reinterpret_cast(::GET_FUNC(handle.get(), table_names[domain_id])); if (!t) { std::cerr << ERROR_MSG << '\n'; diff --git a/src/lapack/backends/cusolver/cusolver_batch.cpp b/src/lapack/backends/cusolver/cusolver_batch.cpp index 59fa47f84..852c1b8fc 100644 --- a/src/lapack/backends/cusolver/cusolver_batch.cpp +++ b/src/lapack/backends/cusolver/cusolver_batch.cpp @@ -31,24 +31,24 @@ namespace cusolver { // BATCH BUFFER API template -inline void geqrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void geqrf_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& 
tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, stride_a, stride_tau, batch_size, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -62,10 +62,10 @@ inline void geqrf_batch(const char *func_name, Func func, sycl::queue &queue, st } #define GEQRF_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, \ + void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, \ std::int64_t stride_tau, std::int64_t batch_size, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ return geqrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a, \ tau, stride_tau, batch_size, scratchpad, scratchpad_size); \ } @@ -78,10 +78,10 @@ GEQRF_STRIDED_BATCH_LAUNCHER(std::complex, cusolverDnZgeqrf) #undef GEQRF_STRIDED_BATCH_LAUNCHER template -inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t n, - sycl::buffer &a, 
std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +inline void getri_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; @@ -91,7 +91,7 @@ inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, st sycl::buffer ipiv32(sycl::range<1>{ ipiv32_size }); sycl::buffer devInfo{ batch_size }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto ipiv_acc = sycl::accessor{ ipiv, cgh, sycl::read_only }; auto ipiv32_acc = sycl::accessor{ ipiv32, cgh, sycl::write_only }; cgh.parallel_for(sycl::range<1>{ ipiv32_size }, [=](sycl::id<1> index) { @@ -102,7 +102,7 @@ inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, st // getri_batched is contained within cublas, not cusolver. 
For this reason // we need to use cublas types instead of cusolver types (as is needed for // other lapack routines) - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { using blas::cublas::cublas_error; sycl::accessor a_acc{ a, cgh, sycl::read_only }; @@ -110,7 +110,7 @@ inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, st sycl::accessor ipiv32_acc{ ipiv32, cgh }; sycl::accessor devInfo_acc{ devInfo, cgh, sycl::write_only }; - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { cublasStatus_t err; CUresult cuda_result; cublasHandle_t cublas_handle; @@ -118,24 +118,24 @@ inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, st CUstream cu_stream = sycl::get_native(queue); CUBLAS_ERROR_FUNC(cublasSetStream, err, cublas_handle, cu_stream); - auto a_ = sc.get_mem(a_acc); - auto scratch_ = sc.get_mem(scratch_acc); - auto ipiv32_ = sc.get_mem(ipiv32_acc); - auto info_ = sc.get_mem(devInfo_acc); + auto a_ = sc.get_mem(a_acc); + auto scratch_ = sc.get_mem(scratch_acc); + auto ipiv32_ = sc.get_mem(ipiv32_acc); + auto info_ = sc.get_mem(devInfo_acc); CUdeviceptr a_dev; - cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size); - auto **a_dev_ = reinterpret_cast(a_dev); + cuDataType** a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T*) * batch_size); + auto** a_dev_ = reinterpret_cast(a_dev); CUdeviceptr scratch_dev; - cuDataType **scratch_batched = + cuDataType** scratch_batched = create_ptr_list_from_stride(scratch_, stride_a, batch_size); 
- CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &scratch_dev, sizeof(T *) * batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &scratch_dev, sizeof(T*) * batch_size); CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, scratch_dev, scratch_batched, - sizeof(T *) * batch_size); - auto **scratch_dev_ = reinterpret_cast(scratch_dev); + sizeof(T*) * batch_size); + auto** scratch_dev_ = reinterpret_cast(scratch_dev); CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, cublas_handle, n, a_dev_, lda, ipiv32_, scratch_dev_, lda, info_, batch_size) @@ -148,7 +148,7 @@ inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, st }); // The inverted matrices stored in scratch_ need to be stored in a_ - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { sycl::accessor a_acc{ a, cgh, sycl::write_only }; sycl::accessor scratch_acc{ scratchpad, cgh, sycl::read_only }; cgh.parallel_for(sycl::range<1>{ static_cast( @@ -156,7 +156,7 @@ inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, st [=](sycl::id<1> index) { a_acc[index] = scratch_acc[index]; }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { sycl::accessor ipiv32_acc{ ipiv32, cgh, sycl::read_only }; sycl::accessor ipiv_acc{ ipiv, cgh, sycl::write_only }; cgh.parallel_for(sycl::range<1>{ static_cast(ipiv32_size) }, @@ -168,10 +168,10 @@ inline void getri_batch(const char *func_name, Func func, sycl::queue &queue, st } #define GETRI_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - std::int64_t stride_a, sycl::buffer &ipiv, \ + void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + std::int64_t stride_a, sycl::buffer& ipiv, \ std::int64_t stride_ipiv, std::int64_t batch_size, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ 
return getri_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, n, a, lda, stride_a, ipiv, \ stride_ipiv, batch_size, scratchpad, scratchpad_size); \ } @@ -184,12 +184,12 @@ GETRI_STRIDED_BATCH_LAUNCHER(std::complex, cublasZgetriBatched) #undef GETRI_STRIDED_BATCH_LAUNCHER template -inline void getrs_batch(const char *func_name, Func func, sycl::queue &queue, +inline void getrs_batch(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; @@ -201,7 +201,7 @@ inline void getrs_batch(const char *func_name, Func func, sycl::queue &queue, std::uint64_t ipiv_size = stride_ipiv * batch_size; sycl::buffer ipiv32(sycl::range<1>{ ipiv_size }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { @@ -209,16 +209,16 @@ inline void getrs_batch(const char *func_name, Func func, sycl::queue &queue, }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv_acc = ipiv32.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = 
sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv_ = sc.get_mem(ipiv_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv_ = sc.get_mem(ipiv_acc); + auto b_ = sc.get_mem(b_acc); cusolverStatus_t err; // Does not use scratch so call cuSolver asynchronously and sync at end @@ -233,12 +233,12 @@ inline void getrs_batch(const char *func_name, Func func, sycl::queue &queue, } #define GETRS_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, \ - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, \ - std::int64_t stride_a, sycl::buffer &ipiv, \ - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, \ + void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, \ + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, \ + std::int64_t stride_a, sycl::buffer& ipiv, \ + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, \ std::int64_t stride_b, std::int64_t batch_size, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ return getrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, \ stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, \ scratchpad_size); \ @@ -252,10 +252,10 @@ GETRS_STRIDED_BATCH_LAUNCHER(std::complex, cusolverDnZgetrs) #undef GETRS_STRIDED_BATCH_LAUNCHER template -inline void getrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +inline void getrf_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; @@ -268,17 +268,17 @@ inline void getrf_batch(const char *func_name, Func func, sycl::queue &queue, st sycl::buffer ipiv32(sycl::range<1>{ ipiv_size }); sycl::buffer devInfo{ batch_size }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv32_acc = ipiv32.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv_ = sc.get_mem(ipiv32_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv_ = sc.get_mem(ipiv32_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -290,7 +290,7 @@ inline void getrf_batch(const char *func_name, Func func, sycl::queue &queue, st }); // Copy from 32-bit USM to 64-bit - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); cgh.parallel_for(sycl::range<1>{ ipiv_size }, @@ -301,10 +301,10 @@ inline void getrf_batch(const char *func_name, Func func, sycl::queue &queue, st } #define GETRF_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, \ + void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, std::int64_t stride_a, sycl::buffer& 
ipiv, \ std::int64_t stride_ipiv, std::int64_t batch_size, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ return getrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a, \ ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); \ } @@ -317,25 +317,25 @@ GETRF_STRIDED_BATCH_LAUNCHER(std::complex, cusolverDnZgetrf) #undef GETRF_STRIDED_BATCH_LAUNCHER template -inline void orgqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +inline void orgqr_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -349,10 +349,10 @@ inline void orgqr_batch(const char *func_name, Func func, 
sycl::queue &queue, st } #define ORGQR_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, \ - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, \ - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, \ + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ return orgqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \ tau, stride_tau, batch_size, scratchpad, scratchpad_size); \ } @@ -363,30 +363,30 @@ ORGQR_STRIDED_BATCH_LAUNCHER(double, cusolverDnDorgqr) #undef ORGQR_STRIDED_BATCH_LAUNCHER template -inline void potrf_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, +inline void potrf_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, stride_a, batch_size, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); CUdeviceptr a_dev; CUresult cuda_result; cusolverStatus_t err; - auto a_ = sc.get_mem(a_acc); + auto a_ = sc.get_mem(a_acc); // Transform ptr and stride to list of 
ptr's - cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size); + cuDataType** a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T*) * batch_size); - auto **a_dev_ = reinterpret_cast(a_dev); + auto** a_dev_ = reinterpret_cast(a_dev); CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), (int)n, a_dev_, (int)lda, nullptr, (int)batch_size); @@ -399,9 +399,9 @@ inline void potrf_batch(const char *func_name, Func func, sycl::queue &queue, // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRF_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, \ - std::int64_t batch_size, sycl::buffer &scratchpad, \ + void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, \ + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, \ + std::int64_t batch_size, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ return potrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, stride_a, \ batch_size, scratchpad, scratchpad_size); \ @@ -415,11 +415,11 @@ POTRF_STRIDED_BATCH_LAUNCHER(std::complex, cusolverDnZpotrfBatched) #undef POTRF_STRIDED_BATCH_LAUNCHER template -inline void potrs_batch(const char *func_name, Func func, sycl::queue &queue, +inline void potrs_batch(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t 
stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; @@ -429,28 +429,28 @@ inline void potrs_batch(const char *func_name, Func func, sycl::queue &queue, if (nrhs != 1) throw unimplemented("lapack", "potrs_batch", "cusolver potrs_batch only supports nrhs = 1"); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); CUdeviceptr a_dev, b_dev; cusolverStatus_t err; CUresult cuda_result; - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); // Transform ptr and stride to list of ptr's - cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); - cuDataType **b_batched = create_ptr_list_from_stride(b_, stride_b, batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &b_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, b_dev, b_batched, sizeof(T *) * batch_size); + cuDataType** a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); + cuDataType** b_batched = create_ptr_list_from_stride(b_, stride_b, batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, 
cuda_result, &b_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, b_dev, b_batched, sizeof(T*) * batch_size); - auto **a_dev_ = reinterpret_cast(a_dev); - auto **b_dev_ = reinterpret_cast(b_dev); + auto** a_dev_ = reinterpret_cast(a_dev); + auto** b_dev_ = reinterpret_cast(b_dev); CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), (int)n, (int)nrhs, a_dev_, (int)lda, b_dev_, ldb, nullptr, @@ -466,11 +466,11 @@ inline void potrs_batch(const char *func_name, Func func, sycl::queue &queue, // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRS_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, \ - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, \ - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, \ + void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, \ + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, \ + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, \ std::int64_t stride_b, std::int64_t batch_size, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ return potrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, \ stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); \ } @@ -483,25 +483,25 @@ POTRS_STRIDED_BATCH_LAUNCHER(std::complex, cusolverDnZpotrsBatched) #undef POTRS_STRIDED_BATCH_LAUNCHER template -inline void ungqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, +inline void ungqr_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& 
a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -515,10 +515,10 @@ inline void ungqr_batch(const char *func_name, Func func, sycl::queue &queue, st } #define UNGQR_STRIDED_BATCH_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, \ - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, \ - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, \ + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ return ungqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \ tau, stride_tau, batch_size, scratchpad, scratchpad_size); \ } @@ -531,22 +531,22 @@ UNGQR_STRIDED_BATCH_LAUNCHER(std::complex, cusolverDnZungqr) 
// BATCH USM API template -inline sycl::event geqrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a, - T *tau, std::int64_t stride_tau, std::int64_t batch_size, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event geqrf_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T* a, std::int64_t lda, std::int64_t stride_a, + T* tau, std::int64_t stride_tau, std::int64_t batch_size, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, stride_a, stride_tau, batch_size, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -562,11 +562,11 @@ inline sycl::event geqrf_batch(const char *func_name, Func func, sycl::queue &qu } #define GEQRF_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, std::int64_t stride_a, TYPE *tau, \ - std::int64_t stride_tau, std::int64_t batch_size, TYPE *scratchpad, \ + sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, std::int64_t stride_a, TYPE* tau, \ + std::int64_t stride_tau, std::int64_t 
batch_size, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return geqrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a, \ tau, stride_tau, batch_size, scratchpad, scratchpad_size, \ dependencies); \ @@ -580,24 +580,24 @@ GEQRF_STRIDED_BATCH_LAUNCHER_USM(std::complex, cusolverDnZgeqrf) #undef GEQRF_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event geqrf_batch(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, T **a, std::int64_t *lda, T **tau, - std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, +inline sycl::event geqrf_batch(const char* func_name, Func func, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, T** a, std::int64_t* lda, T** tau, + std::int64_t group_count, std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(group_count, scratchpad_size); for (int64_t i = 0; i < group_count; ++i) overflow_check(m[i], n[i], lda[i], group_sizes[i]); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); int64_t global_id = 0; cusolverStatus_t err; @@ -618,9 +618,9 @@ inline sycl::event geqrf_batch(const char *func_name, Func func, sycl::queue &qu #define GEQRF_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ sycl::event geqrf_batch( \ - 
sycl::queue &queue, std::int64_t *m, std::int64_t *n, TYPE **a, std::int64_t *lda, \ - TYPE **tau, std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad, \ - std::int64_t scratchpad_size, const std::vector &dependencies) { \ + sycl::queue& queue, std::int64_t* m, std::int64_t* n, TYPE** a, std::int64_t* lda, \ + TYPE** tau, std::int64_t group_count, std::int64_t* group_sizes, TYPE* scratchpad, \ + std::int64_t scratchpad_size, const std::vector& dependencies) { \ return geqrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, tau, \ group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \ } @@ -633,11 +633,11 @@ GEQRF_BATCH_LAUNCHER_USM(std::complex, cusolverDnZgeqrf) #undef GEQRF_BATCH_LAUNCHER_USM template -inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event getrf_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, stride_a, stride_ipiv, batch_size, scratchpad_size); @@ -646,17 +646,17 @@ inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &qu // To get around the limitation. 
// Allocate memory with 32-bit ints then copy over results std::uint64_t ipiv_size = stride_ipiv * batch_size; - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); - int *devInfo = (int *)malloc_device(sizeof(int) * batch_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); + int* devInfo = (int*)malloc_device(sizeof(int) * batch_size, queue); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratchpad_ = reinterpret_cast(scratchpad); - auto ipiv_ = reinterpret_cast(ipiv32); + auto a_ = reinterpret_cast(a); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratchpad_ = reinterpret_cast(scratchpad); + auto ipiv_ = reinterpret_cast(ipiv32); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -668,14 +668,14 @@ inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &qu }); // Copy from 32-bit USM to 64-bit - sycl::event done_casting = queue.submit([&](sycl::handler &cgh) { + sycl::event done_casting = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv[index] = ipiv32[index]; }); }); // Enqueue free memory, don't return event as not-neccessary for user to wait for ipiv32 being released - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done_casting); cgh.host_task([=](sycl::interop_handle ih) { sycl::free(ipiv32, queue); }); }); @@ -688,11 +688,11 @@ inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &qu } #define GETRF_STRIDED_BATCH_LAUNCHER_USM(TYPE, 
CUSOLVER_ROUTINE) \ - sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, \ - std::int64_t stride_ipiv, std::int64_t batch_size, TYPE *scratchpad, \ + sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, \ + std::int64_t stride_ipiv, std::int64_t batch_size, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return getrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, stride_a, \ ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, \ dependencies); \ @@ -706,12 +706,12 @@ GETRF_STRIDED_BATCH_LAUNCHER_USM(std::complex, cusolverDnZgetrf) #undef GETRF_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, T **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, T *scratchpad, +inline sycl::event getrf_batch(const char* func_name, Func func, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, T** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; int64_t batch_size = 0; @@ -724,19 +724,19 @@ inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &qu // cuSolver legacy api does not accept 64-bit ints. // To get around the limitation. 
// Allocate memory with 32-bit ints then copy over results - int **ipiv32 = (int **)malloc(sizeof(int *) * batch_size); + int** ipiv32 = (int**)malloc(sizeof(int*) * batch_size); int64_t global_id = 0; for (int64_t group_id = 0; group_id < group_count; ++group_id) for (int64_t local_id = 0; local_id < group_sizes[group_id]; ++local_id, ++global_id) - ipiv32[global_id] = (int *)malloc_device(sizeof(int) * n[group_id], queue); - int *devInfo = (int *)malloc_device(sizeof(int) * batch_size, queue); + ipiv32[global_id] = (int*)malloc_device(sizeof(int) * n[group_id], queue); + int* devInfo = (int*)malloc_device(sizeof(int) * batch_size, queue); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto scratch_ = reinterpret_cast(scratchpad); int64_t global_id = 0; cusolverStatus_t err; @@ -757,10 +757,10 @@ inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &qu for (int64_t group_id = 0, global_id = 0; group_id < group_count; ++group_id) { uint64_t ipiv_size = n[group_id]; for (int64_t local_id = 0; local_id < group_sizes[group_id]; ++local_id, ++global_id) { - int64_t *d_ipiv = ipiv[global_id]; - int *d_ipiv32 = ipiv32[global_id]; + int64_t* d_ipiv = ipiv[global_id]; + int* d_ipiv32 = ipiv32[global_id]; - sycl::event e = queue.submit([&](sycl::handler &cgh) { + sycl::event e = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { d_ipiv[index] = d_ipiv32[index]; }); @@ -770,7 +770,7 @@ inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &qu } // Enqueue 
free memory - sycl::event done_freeing = queue.submit([&](sycl::handler &cgh) { + sycl::event done_freeing = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(casting_dependencies); cgh.host_task([=](sycl::interop_handle ih) { for (int64_t global_id = 0; global_id < batch_size; ++global_id) @@ -787,11 +787,11 @@ inline sycl::event getrf_batch(const char *func_name, Func func, sycl::queue &qu } #define GETRF_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, TYPE **a, \ - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, \ - std::int64_t *group_sizes, TYPE *scratchpad, \ + sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, TYPE** a, \ + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, \ + std::int64_t* group_sizes, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return getrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, \ group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \ } @@ -804,20 +804,20 @@ GETRF_BATCH_LAUNCHER_USM(std::complex, cusolverDnZgetrf) #undef GETRS_BATCH_LAUNCHER_USM template -sycl::event getri_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t n, T *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, T *scratchpad, +sycl::event getri_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t n, T* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, stride_a, stride_ipiv, batch_size, scratchpad_size); std::uint64_t ipiv32_size = n 
* batch_size; - int *ipiv32 = sycl::malloc_device(ipiv32_size, queue); - int *devInfo = sycl::malloc_device(batch_size, queue); + int* ipiv32 = sycl::malloc_device(ipiv32_size, queue); + int* devInfo = sycl::malloc_device(batch_size, queue); - sycl::event done_casting = queue.submit([&](sycl::handler &cgh) { + sycl::event done_casting = queue.submit([&](sycl::handler& cgh) { cgh.parallel_for( sycl::range<1>{ static_cast(ipiv32_size) }, [=](sycl::id<1> index) { ipiv32[index] = static_cast(ipiv[(index / n) * stride_ipiv + index % n]); @@ -827,13 +827,13 @@ sycl::event getri_batch(const char *func_name, Func func, sycl::queue &queue, st // getri_batched is contained within cublas, not cusolver. For this reason // we need to use cublas types instead of cusolver types (as is needed for // other lapack routines) - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { using blas::cublas::cublas_error; cgh.depends_on(done_casting); cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { cublasStatus_t err; CUresult cuda_result; cublasHandle_t cublas_handle; @@ -842,20 +842,20 @@ sycl::event getri_batch(const char *func_name, Func func, sycl::queue &queue, st CUBLAS_ERROR_FUNC(cublasSetStream, err, cublas_handle, cu_stream); CUdeviceptr a_dev; - auto *a_ = reinterpret_cast(a); - cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size); - auto **a_dev_ = reinterpret_cast(a_dev); + auto* a_ = reinterpret_cast(a); + cuDataType** a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, 
cuda_result, a_dev, a_batched, sizeof(T*) * batch_size); + auto** a_dev_ = reinterpret_cast(a_dev); CUdeviceptr scratch_dev; - auto *scratch_ = reinterpret_cast(scratchpad); - cuDataType **scratch_batched = + auto* scratch_ = reinterpret_cast(scratchpad); + cuDataType** scratch_batched = create_ptr_list_from_stride(scratch_, stride_a, batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &scratch_dev, sizeof(T *) * batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &scratch_dev, sizeof(T*) * batch_size); CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, scratch_dev, scratch_batched, - sizeof(T *) * batch_size); - auto **scratch_dev_ = reinterpret_cast(scratch_dev); + sizeof(T*) * batch_size); + auto** scratch_dev_ = reinterpret_cast(scratch_dev); CUBLAS_ERROR_FUNC_T_SYNC(func_name, func, err, cublas_handle, n, a_dev_, lda, ipiv32, scratch_dev_, lda, devInfo, batch_size) @@ -868,14 +868,14 @@ sycl::event getri_batch(const char *func_name, Func func, sycl::queue &queue, st }); // The inverted matrices stored in scratch_ need to be stored in a_ - auto copy1 = queue.submit([&](sycl::handler &cgh) { + auto copy1 = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for( sycl::range<1>{ static_cast(stride_a * (batch_size - 1) + lda * n) }, [=](sycl::id<1> index) { a[index] = scratchpad[index]; }); }); - auto copy2 = queue.submit([&](sycl::handler &cgh) { + auto copy2 = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for( sycl::range<1>{ static_cast(ipiv32_size) }, [=](sycl::id<1> index) { @@ -891,9 +891,9 @@ sycl::event getri_batch(const char *func_name, Func func, sycl::queue &queue, st #define GETRI_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ sycl::event getri_batch( \ - sycl::queue &queue, std::int64_t n, TYPE *a, std::int64_t lda, std::int64_t stride_a, \ - std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, TYPE *scratchpad, \ - std::int64_t scratchpad_size, const std::vector &dependencies) { \ + 
sycl::queue& queue, std::int64_t n, TYPE* a, std::int64_t lda, std::int64_t stride_a, \ + std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, TYPE* scratchpad, \ + std::int64_t scratchpad_size, const std::vector& dependencies) { \ return getri_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, n, a, lda, stride_a, ipiv, \ stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); \ } @@ -905,41 +905,41 @@ GETRI_BATCH_LAUNCHER_USM(std::complex, cublasZgetriBatched) #undef GETRI_BATCH_LAUNCHER_USM -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t 
group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } template -inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event getrs_batch(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - T *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, T *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, T *scratchpad, + T* a, std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, T* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, stride_ipiv, stride_b, batch_size, scratchpad_size); @@ -948,22 +948,22 @@ inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &qu // To get around the limitation. // Create new memory and convert 64-bit values. 
std::uint64_t ipiv_size = stride_ipiv * batch_size; - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); - auto done_casting = queue.submit([&](sycl::handler &cgh) { + auto done_casting = queue.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv32[index] = static_cast(ipiv[index]); }); }); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.depends_on(done_casting); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto ipiv_ = reinterpret_cast(ipiv32); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto ipiv_ = reinterpret_cast(ipiv32); + auto b_ = reinterpret_cast(b); cusolverStatus_t err; // Does not use scratch so call cuSolver asynchronously and sync at end @@ -982,12 +982,12 @@ inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &qu } #define GETRS_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, \ - std::int64_t nrhs, TYPE *a, std::int64_t lda, std::int64_t stride_a, \ - std::int64_t *ipiv, std::int64_t stride_ipiv, TYPE *b, \ + sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, \ + std::int64_t nrhs, TYPE* a, std::int64_t lda, std::int64_t stride_a, \ + std::int64_t* ipiv, std::int64_t stride_ipiv, TYPE* b, \ std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, \ - TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return 
getrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, \ stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, \ scratchpad_size, dependencies); \ @@ -1001,13 +1001,13 @@ GETRS_STRIDED_BATCH_LAUNCHER_USM(std::complex, cusolverDnZgetrs) #undef GETRS_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - T **a, std::int64_t *lda, std::int64_t **ipiv, T **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, T *scratchpad, +inline sycl::event getrs_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + T** a, std::int64_t* lda, std::int64_t** ipiv, T** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; int64_t batch_size = 0; @@ -1022,17 +1022,17 @@ inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &qu // an array of 64-bit ints in device memory. Each vec of ipiv // values need to be converted from 64-bit to 32-bit. The list // must stay on host. 
- int **ipiv32 = (int **)malloc(sizeof(int *) * batch_size); + int** ipiv32 = (int**)malloc(sizeof(int*) * batch_size); std::vector casting_dependencies(batch_size); int64_t global_id = 0; for (int64_t group_id = 0; group_id < group_count; ++group_id) { for (int64_t local_id = 0; local_id < group_sizes[group_id]; ++local_id, ++global_id) { uint64_t ipiv_size = n[group_id]; - int *d_group_ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* d_group_ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); ipiv32[global_id] = d_group_ipiv32; - int64_t *d_group_ipiv = ipiv[global_id]; + int64_t* d_group_ipiv = ipiv[global_id]; - auto e = queue.submit([&](sycl::handler &cgh) { + auto e = queue.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { d_group_ipiv32[index] = static_cast(d_group_ipiv[index]); }); @@ -1041,14 +1041,14 @@ inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &qu } } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.depends_on(casting_dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); cusolverStatus_t err; int64_t global_id = 0; @@ -1075,10 +1075,10 @@ inline sycl::event getrs_batch(const char *func_name, Func func, sycl::queue &qu #define GETRS_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ sycl::event getrs_batch( \ - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, \ - TYPE **a, std::int64_t *lda, std::int64_t **ipiv, TYPE **b, std::int64_t *ldb, \ - std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad, \ - std::int64_t 
scratchpad_size, const std::vector &dependencies) { \ + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, \ + TYPE** a, std::int64_t* lda, std::int64_t** ipiv, TYPE** b, std::int64_t* ldb, \ + std::int64_t group_count, std::int64_t* group_sizes, TYPE* scratchpad, \ + std::int64_t scratchpad_size, const std::vector& dependencies) { \ return getrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, \ ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, \ dependencies); \ @@ -1092,22 +1092,22 @@ GETRS_BATCH_LAUNCHER_USM(std::complex, cusolverDnZgetrs) #undef GETRS_BATCH_LAUNCHER_USM template -inline sycl::event orgqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, - std::int64_t stride_a, T *tau, std::int64_t stride_tau, - std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event orgqr_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, + std::int64_t stride_a, T* tau, std::int64_t stride_tau, + std::int64_t batch_size, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = 
reinterpret_cast(scratchpad); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -1123,11 +1123,11 @@ inline sycl::event orgqr_batch(const char *func_name, Func func, sycl::queue &qu } #define ORGQR_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, \ - TYPE *a, std::int64_t lda, std::int64_t stride_a, TYPE *tau, \ - std::int64_t stride_tau, std::int64_t batch_size, TYPE *scratchpad, \ + sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + TYPE* a, std::int64_t lda, std::int64_t stride_a, TYPE* tau, \ + std::int64_t stride_tau, std::int64_t batch_size, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return orgqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \ tau, stride_tau, batch_size, scratchpad, scratchpad_size, \ dependencies); \ @@ -1139,25 +1139,25 @@ ORGQR_STRIDED_BATCH_LAUNCHER_USM(double, cusolverDnDorgqr) #undef ORGQR_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event orgqr_batch(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, T **a, - std::int64_t *lda, T **tau, std::int64_t group_count, - std::int64_t *group_sizes, T *scratchpad, +inline sycl::event orgqr_batch(const char* func_name, Func func, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, T** a, + std::int64_t* lda, T** tau, std::int64_t group_count, + std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(group_count, scratchpad_size); for (int64_t i = 0; i < group_count; ++i) overflow_check(m[i], n[i], k[i], lda[i], group_sizes[i]); - auto done = queue.submit([&](sycl::handler 
&cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); int64_t global_id = 0; cusolverStatus_t err; @@ -1178,11 +1178,11 @@ inline sycl::event orgqr_batch(const char *func_name, Func func, sycl::queue &qu } #define ORGQR_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, \ - TYPE **a, std::int64_t *lda, TYPE **tau, std::int64_t group_count, \ - std::int64_t *group_sizes, TYPE *scratchpad, \ + sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, \ + TYPE** a, std::int64_t* lda, TYPE** tau, std::int64_t group_count, \ + std::int64_t* group_sizes, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return orgqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, \ group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \ } @@ -1193,31 +1193,31 @@ ORGQR_BATCH_LAUNCHER_USM(double, cusolverDnDorgqr) #undef ORGQR_BATCH_LAUNCHER_USM template -inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t batch_size, T *scratchpad, +inline sycl::event potrf_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t batch_size, T* scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, stride_a, batch_size, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); CUdeviceptr a_dev; cusolverStatus_t err; CUresult cuda_result; - auto *a_ = reinterpret_cast(a); + auto* a_ = reinterpret_cast(a); // Transform ptr and stride to list of ptr's - cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size); + cuDataType** a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T*) * batch_size); - auto **a_dev_ = reinterpret_cast(a_dev); + auto** a_dev_ = reinterpret_cast(a_dev); CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), (int)n, a_dev_, (int)lda, nullptr, (int)batch_size); @@ -1231,10 +1231,10 @@ inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &qu // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRF_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ + sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, \ - TYPE *scratchpad, std::int64_t 
scratchpad_size, \ - const std::vector &dependencies) { \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, stride_a, \ batch_size, scratchpad, scratchpad_size, dependencies); \ } @@ -1247,11 +1247,11 @@ POTRF_STRIDED_BATCH_LAUNCHER_USM(std::complex, cusolverDnZpotrfBatched) #undef POTRF_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, T **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, +inline sycl::event potrf_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, T** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; int64_t batch_size = 0; @@ -1260,19 +1260,19 @@ inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &qu batch_size += group_sizes[i]; } - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; CUdeviceptr a_dev; CUresult cuda_result; cusolverStatus_t err; - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a, sizeof(T *) * batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a, sizeof(T*) * batch_size); - auto **a_dev_ = reinterpret_cast(a_dev); + 
auto** a_dev_ = reinterpret_cast(a_dev); // Does not use scratch so call cuSolver asynchronously and sync at end for (int64_t i = 0; i < group_count; i++) { @@ -1292,9 +1292,9 @@ inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &qu // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRF_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ sycl::event potrf_batch( \ - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, TYPE **a, std::int64_t *lda, \ - std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad, \ - std::int64_t scratchpad_size, const std::vector &dependencies) { \ + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, TYPE** a, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes, TYPE* scratchpad, \ + std::int64_t scratchpad_size, const std::vector& dependencies) { \ return potrf_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, \ group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \ } @@ -1307,12 +1307,12 @@ POTRF_BATCH_LAUNCHER_USM(std::complex, cusolverDnZpotrfBatched) #undef POTRF_BATCH_LAUNCHER_USM template -inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T *a, - std::int64_t lda, std::int64_t stride_a, T *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, T *scratchpad, +inline sycl::event potrs_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T* a, + std::int64_t lda, std::int64_t stride_a, T* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, stride_a, stride_b, batch_size, 
scratchpad_size); @@ -1321,26 +1321,26 @@ inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &qu if (nrhs != 1) throw unimplemented("lapack", "potrs_batch", "cusolver potrs_batch only supports nrhs = 1"); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); CUresult cuda_result; CUdeviceptr a_dev, b_dev; - auto *a_ = reinterpret_cast(a); - auto *b_ = reinterpret_cast(b); + auto* a_ = reinterpret_cast(a); + auto* b_ = reinterpret_cast(b); cusolverStatus_t err; // Transform ptr and stride to list of ptr's - cuDataType **a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); - cuDataType **b_batched = create_ptr_list_from_stride(b_, stride_b, batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &b_dev, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T *) * batch_size); - CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, b_dev, b_batched, sizeof(T *) * batch_size); + cuDataType** a_batched = create_ptr_list_from_stride(a_, stride_a, batch_size); + cuDataType** b_batched = create_ptr_list_from_stride(b_, stride_b, batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &a_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemAlloc, cuda_result, &b_dev, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, a_dev, a_batched, sizeof(T*) * batch_size); + CUDA_ERROR_FUNC(cuMemcpyHtoD, cuda_result, b_dev, b_batched, sizeof(T*) * batch_size); - auto **a_dev_ = reinterpret_cast(a_dev); - auto **b_dev_ = reinterpret_cast(b_dev); + auto** a_dev_ = reinterpret_cast(a_dev); + auto** b_dev_ = reinterpret_cast(b_dev); 
CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), (int)n, (int)nrhs, a_dev_, (int)lda, b_dev_, ldb, nullptr, @@ -1357,10 +1357,10 @@ inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &qu // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRS_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ sycl::event potrs_batch( \ - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, TYPE *a, \ - std::int64_t lda, std::int64_t stride_a, TYPE *b, std::int64_t ldb, std::int64_t stride_b, \ - std::int64_t batch_size, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, TYPE* a, \ + std::int64_t lda, std::int64_t stride_a, TYPE* b, std::int64_t ldb, std::int64_t stride_b, \ + std::int64_t batch_size, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, \ stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, \ dependencies); \ @@ -1374,12 +1374,12 @@ POTRS_STRIDED_BATCH_LAUNCHER_USM(std::complex, cusolverDnZpotrsBatched) #undef POTRS_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, T **a, - std::int64_t *lda, T **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, +inline sycl::event potrs_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, T** a, + std::int64_t* lda, T** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& 
dependencies) { using cuDataType = typename CudaEquivalentType::Type; int64_t batch_size = 0; @@ -1393,29 +1393,29 @@ inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &qu "cusolver potrs_batch only supports nrhs = 1"); } - int *info = (int *)malloc_device(sizeof(int *) * batch_size, queue); - T **a_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue); - T **b_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue); + int* info = (int*)malloc_device(sizeof(int*) * batch_size, queue); + T** a_dev = (T**)malloc_device(sizeof(T*) * batch_size, queue); + T** b_dev = (T**)malloc_device(sizeof(T*) * batch_size, queue); auto done_cpy_a = - queue.submit([&](sycl::handler &h) { h.memcpy(a_dev, a, batch_size * sizeof(T *)); }); + queue.submit([&](sycl::handler& h) { h.memcpy(a_dev, a, batch_size * sizeof(T*)); }); auto done_cpy_b = - queue.submit([&](sycl::handler &h) { h.memcpy(b_dev, b, batch_size * sizeof(T *)); }); + queue.submit([&](sycl::handler& h) { h.memcpy(b_dev, b, batch_size * sizeof(T*)); }); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.depends_on(done_cpy_a); cgh.depends_on(done_cpy_b); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; cusolverStatus_t err; // Does not use scratch so call cuSolver asynchronously and sync at end for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a_dev); - auto **b_ = reinterpret_cast(b_dev); - auto info_ = reinterpret_cast(info); + auto** a_ = reinterpret_cast(a_dev); + auto** b_ = reinterpret_cast(b_dev); + auto info_ = reinterpret_cast(info); CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo[i]), (int)n[i], (int)nrhs[i], a_ + offset, (int)lda[i], b_ + offset, (int)ldb[i], info_, 
(int)group_sizes[i]); @@ -1430,10 +1430,10 @@ inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &qu // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRS_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ sycl::event potrs_batch( \ - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, \ - TYPE **a, std::int64_t *lda, TYPE **b, std::int64_t *ldb, std::int64_t group_count, \ - std::int64_t *group_sizes, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, \ + TYPE** a, std::int64_t* lda, TYPE** b, std::int64_t* ldb, std::int64_t group_count, \ + std::int64_t* group_sizes, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potrs_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, \ ldb, group_count, group_sizes, scratchpad, scratchpad_size, \ dependencies); \ @@ -1447,22 +1447,22 @@ POTRS_BATCH_LAUNCHER_USM(std::complex, cusolverDnZpotrsBatched) #undef POTRS_BATCH_LAUNCHER_USM template -inline sycl::event ungqr_batch(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, - std::int64_t stride_a, T *tau, std::int64_t stride_tau, - std::int64_t batch_size, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event ungqr_batch(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, + std::int64_t stride_a, T* tau, std::int64_t stride_tau, + std::int64_t batch_size, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, stride_a, stride_tau, batch_size, scratchpad_size); - auto done = 
queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; // Uses scratch so sync between each cuSolver call @@ -1478,11 +1478,11 @@ inline sycl::event ungqr_batch(const char *func_name, Func func, sycl::queue &qu } #define UNGQR_STRIDED_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, \ - TYPE *a, std::int64_t lda, std::int64_t stride_a, TYPE *tau, \ - std::int64_t stride_tau, std::int64_t batch_size, TYPE *scratchpad, \ + sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + TYPE* a, std::int64_t lda, std::int64_t stride_a, TYPE* tau, \ + std::int64_t stride_tau, std::int64_t batch_size, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return ungqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, stride_a, \ tau, stride_tau, batch_size, scratchpad, scratchpad_size, \ dependencies); \ @@ -1494,25 +1494,25 @@ UNGQR_STRIDED_BATCH_LAUNCHER_USM(std::complex, cusolverDnZungqr) #undef UNGQR_STRIDED_BATCH_LAUNCHER_USM template -inline sycl::event ungqr_batch(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, T **a, - std::int64_t *lda, T **tau, std::int64_t group_count, - std::int64_t *group_sizes, T *scratchpad, +inline sycl::event ungqr_batch(const char* func_name, Func func, 
sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, T** a, + std::int64_t* lda, T** tau, std::int64_t group_count, + std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(group_count, scratchpad_size); for (int64_t i = 0; i < group_count; ++i) overflow_check(m[i], n[i], k[i], lda[i], group_sizes[i]); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); int64_t global_id = 0; cusolverStatus_t err; @@ -1533,11 +1533,11 @@ inline sycl::event ungqr_batch(const char *func_name, Func func, sycl::queue &qu } #define UNGQR_BATCH_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, \ - TYPE **a, std::int64_t *lda, TYPE **tau, std::int64_t group_count, \ - std::int64_t *group_sizes, TYPE *scratchpad, \ + sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, \ + TYPE** a, std::int64_t* lda, TYPE** tau, std::int64_t group_count, \ + std::int64_t* group_sizes, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return ungqr_batch(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, \ group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \ } @@ -1550,12 +1550,12 @@ 
UNGQR_BATCH_LAUNCHER_USM(std::complex, cusolverDnZungqr) // BATCH SCRATCHPAD API template -inline void getrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void getrf_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, - std::int64_t batch_size, int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t batch_size, int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; @@ -1618,12 +1618,12 @@ GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex) #undef GETRS_STRIDED_BATCH_LAUNCHER_SCRATCH template -inline void geqrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void geqrf_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, - std::int64_t batch_size, int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t batch_size, int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; @@ -1685,13 +1685,13 @@ POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex) #undef POTRS_STRIDED_BATCH_LAUNCHER_SCRATCH template -inline void orgqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void orgqr_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, 
std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size, - int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; @@ -1719,13 +1719,13 @@ ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(double, cusolverDnDorgqr_bufferSize) #undef ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH template -inline void ungqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void ungqr_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size, - int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; @@ -1753,12 +1753,12 @@ ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH(std::complex, cusolverDnZungqr_buff #undef ORGQR_STRIDED_BATCH_LAUNCHER_SCRATCH template -inline void getrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { +inline void getrf_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* 
group_sizes, + int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int group_scratch_size = 0; *scratch_size = 0; @@ -1779,8 +1779,8 @@ inline void getrf_batch_scratchpad_size(const char *func_name, Func func, sycl:: #define GETRF_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE) \ template <> \ std::int64_t getrf_batch_scratchpad_size( \ - sycl::queue & queue, std::int64_t * m, std::int64_t * n, std::int64_t * lda, \ - std::int64_t group_count, std::int64_t * group_sizes) { \ + sycl::queue & queue, std::int64_t* m, std::int64_t* n, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes) { \ int scratch_size; \ getrf_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda, \ group_count, group_sizes, &scratch_size); \ @@ -1794,18 +1794,18 @@ GETRF_GROUP_LAUNCHER_SCRATCH(std::complex, cusolverDnZgetrf_bufferSize) #undef GETRF_GROUP_LAUNCHER_SCRATCH -#define GETRI_GROUP_LAUNCHER_SCRATCH(TYPE) \ - template <> \ - std::int64_t getri_batch_scratchpad_size(sycl::queue & queue, std::int64_t * n, \ - std::int64_t * lda, std::int64_t group_count, \ - std::int64_t * group_sizes) { \ - std::int64_t max_scratch_sz = 0; \ - for (auto group_id = 0; group_id < group_count; ++group_id) { \ - auto scratch_sz = lda[group_id] * n[group_id]; \ - if (scratch_sz > max_scratch_sz) \ - max_scratch_sz = scratch_sz; \ - } \ - return max_scratch_sz; \ +#define GETRI_GROUP_LAUNCHER_SCRATCH(TYPE) \ + template <> \ + std::int64_t getri_batch_scratchpad_size(sycl::queue & queue, std::int64_t* n, \ + std::int64_t* lda, std::int64_t group_count, \ + std::int64_t* group_sizes) { \ + std::int64_t max_scratch_sz = 0; \ + for (auto group_id = 0; group_id < group_count; ++group_id) { \ + auto scratch_sz = lda[group_id] * n[group_id]; \ + if (scratch_sz > max_scratch_sz) \ + max_scratch_sz = scratch_sz; \ + } \ + return 
max_scratch_sz; \ } GETRI_GROUP_LAUNCHER_SCRATCH(float) @@ -1815,13 +1815,13 @@ GETRI_GROUP_LAUNCHER_SCRATCH(std::complex) #undef GETRI_GROUP_LAUNCHER_SCRATCH -#define GETRS_GROUP_LAUNCHER_SCRATCH(TYPE) \ - template <> \ - std::int64_t getrs_batch_scratchpad_size( \ - sycl::queue & queue, oneapi::mkl::transpose * trans, std::int64_t * n, \ - std::int64_t * nrhs, std::int64_t * lda, std::int64_t * ldb, std::int64_t group_count, \ - std::int64_t * group_sizes) { \ - return 0; \ +#define GETRS_GROUP_LAUNCHER_SCRATCH(TYPE) \ + template <> \ + std::int64_t getrs_batch_scratchpad_size( \ + sycl::queue & queue, oneapi::mkl::transpose * trans, std::int64_t* n, std::int64_t* nrhs, \ + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, \ + std::int64_t* group_sizes) { \ + return 0; \ } GETRS_GROUP_LAUNCHER_SCRATCH(float) @@ -1832,12 +1832,12 @@ GETRS_GROUP_LAUNCHER_SCRATCH(std::complex) #undef GETRS_GROUP_LAUNCHER_SCRATCH template -inline void geqrf_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { +inline void geqrf_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int group_scratch_size = 0; *scratch_size = 0; @@ -1858,8 +1858,8 @@ inline void geqrf_batch_scratchpad_size(const char *func_name, Func func, sycl:: #define GEQRF_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE) \ template <> \ std::int64_t geqrf_batch_scratchpad_size( \ - sycl::queue & queue, 
std::int64_t * m, std::int64_t * n, std::int64_t * lda, \ - std::int64_t group_count, std::int64_t * group_sizes) { \ + sycl::queue & queue, std::int64_t* m, std::int64_t* n, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes) { \ int scratch_size; \ geqrf_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, lda, \ group_count, group_sizes, &scratch_size); \ @@ -1874,12 +1874,12 @@ GEQRF_GROUP_LAUNCHER_SCRATCH(std::complex, cusolverDnZgeqrf_bufferSize) #undef GEQRF_GROUP_LAUNCHER_SCRATCH template -inline void orgqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { +inline void orgqr_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int group_scratch_size = 0; *scratch_size = 0; @@ -1898,15 +1898,15 @@ inline void orgqr_batch_scratchpad_size(const char *func_name, Func func, sycl:: e.wait(); } -#define ORGQR_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE) \ - template <> \ - std::int64_t orgqr_batch_scratchpad_size( \ - sycl::queue & queue, std::int64_t * m, std::int64_t * n, std::int64_t * k, \ - std::int64_t * lda, std::int64_t group_count, std::int64_t * group_sizes) { \ - int scratch_size; \ - orgqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda, \ - group_count, group_sizes, &scratch_size); \ - return scratch_size; \ +#define 
ORGQR_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE) \ + template <> \ + std::int64_t orgqr_batch_scratchpad_size( \ + sycl::queue & queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes) { \ + int scratch_size; \ + orgqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda, \ + group_count, group_sizes, &scratch_size); \ + return scratch_size; \ } ORGQR_GROUP_LAUNCHER_SCRATCH(float, cusolverDnSorgqr_bufferSize) @@ -1915,12 +1915,12 @@ ORGQR_GROUP_LAUNCHER_SCRATCH(double, cusolverDnDorgqr_bufferSize) #undef ORGQR_GROUP_LAUNCHER_SCRATCH // cusolverDnXpotrfBatched does not use scratchpad memory -#define POTRF_GROUP_LAUNCHER_SCRATCH(TYPE) \ - template <> \ - std::int64_t potrf_batch_scratchpad_size( \ - sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * lda, \ - std::int64_t group_count, std::int64_t * group_sizes) { \ - return 0; \ +#define POTRF_GROUP_LAUNCHER_SCRATCH(TYPE) \ + template <> \ + std::int64_t potrf_batch_scratchpad_size( \ + sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t* n, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes) { \ + return 0; \ } POTRF_GROUP_LAUNCHER_SCRATCH(float) @@ -1931,13 +1931,13 @@ POTRF_GROUP_LAUNCHER_SCRATCH(std::complex) #undef POTRF_GROUP_LAUNCHER_SCRATCH // cusolverDnXpotrsBatched does not use scratchpad memory -#define POTRS_GROUP_LAUNCHER_SCRATCH(TYPE) \ - template <> \ - std::int64_t potrs_batch_scratchpad_size( \ - sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * nrhs, \ - std::int64_t * lda, std::int64_t * ldb, std::int64_t group_count, \ - std::int64_t * group_sizes) { \ - return 0; \ +#define POTRS_GROUP_LAUNCHER_SCRATCH(TYPE) \ + template <> \ + std::int64_t potrs_batch_scratchpad_size( \ + sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t* n, std::int64_t* nrhs, \ + std::int64_t* lda, std::int64_t* ldb, 
std::int64_t group_count, \ + std::int64_t* group_sizes) { \ + return 0; \ } POTRS_GROUP_LAUNCHER_SCRATCH(float) @@ -1948,12 +1948,12 @@ POTRS_GROUP_LAUNCHER_SCRATCH(std::complex) #undef POTRS_GROUP_LAUNCHER_SCRATCH template -inline void ungqr_batch_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, int *scratch_size) { - auto e = queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { +inline void ungqr_batch_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, int* scratch_size) { + auto e = queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int group_scratch_size = 0; *scratch_size = 0; @@ -1972,15 +1972,15 @@ inline void ungqr_batch_scratchpad_size(const char *func_name, Func func, sycl:: e.wait(); } -#define UNGQR_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE) \ - template <> \ - std::int64_t ungqr_batch_scratchpad_size( \ - sycl::queue & queue, std::int64_t * m, std::int64_t * n, std::int64_t * k, \ - std::int64_t * lda, std::int64_t group_count, std::int64_t * group_sizes) { \ - int scratch_size; \ - ungqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, lda, \ - group_count, group_sizes, &scratch_size); \ - return scratch_size; \ +#define UNGQR_GROUP_LAUNCHER_SCRATCH(TYPE, CUSOLVER_ROUTINE) \ + template <> \ + std::int64_t ungqr_batch_scratchpad_size( \ + sycl::queue & queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes) { \ + int scratch_size; \ + ungqr_batch_scratchpad_size(#CUSOLVER_ROUTINE, 
CUSOLVER_ROUTINE, queue, m, n, k, lda, \ + group_count, group_sizes, &scratch_size); \ + return scratch_size; \ } UNGQR_GROUP_LAUNCHER_SCRATCH(std::complex, cusolverDnCungqr_bufferSize) diff --git a/src/lapack/backends/cusolver/cusolver_handle.hpp b/src/lapack/backends/cusolver/cusolver_handle.hpp index f3b587039..75d589b06 100644 --- a/src/lapack/backends/cusolver/cusolver_handle.hpp +++ b/src/lapack/backends/cusolver/cusolver_handle.hpp @@ -28,10 +28,10 @@ namespace cusolver { template struct cusolver_handle { - using handle_container_t = std::unordered_map *>; + using handle_container_t = std::unordered_map*>; handle_container_t cusolver_handle_mapper_{}; ~cusolver_handle() noexcept(false) { - for (auto &handle_pair : cusolver_handle_mapper_) { + for (auto& handle_pair : cusolver_handle_mapper_) { cusolverStatus_t err; if (handle_pair.second != nullptr) { auto handle = handle_pair.second->exchange(nullptr); diff --git a/src/lapack/backends/cusolver/cusolver_helper.hpp b/src/lapack/backends/cusolver/cusolver_helper.hpp index e10f56b36..f674f06a4 100644 --- a/src/lapack/backends/cusolver/cusolver_helper.hpp +++ b/src/lapack/backends/cusolver/cusolver_helper.hpp @@ -82,7 +82,7 @@ void overflow_check(Index index, Next... 
indices) { class cusolver_error : virtual public std::runtime_error { protected: - inline const char *cusolver_error_map(cusolverStatus_t error) { + inline const char* cusolver_error_map(cusolverStatus_t error) { switch (error) { case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS"; @@ -131,7 +131,7 @@ class cusolver_error : virtual public std::runtime_error { class cuda_error : virtual public std::runtime_error { protected: - inline const char *cuda_error_map(CUresult result) { + inline const char* cuda_error_map(CUresult result) { switch (result) { case CUDA_SUCCESS: return "CUDA_SUCCESS"; case CUDA_ERROR_NOT_PERMITTED: return "CUDA_ERROR_NOT_PERMITTED"; @@ -280,25 +280,25 @@ struct CudaEquivalentType> { /* devinfo */ -inline void get_cusolver_devinfo(sycl::queue &queue, sycl::buffer &devInfo, - std::vector &dev_info_) { +inline void get_cusolver_devinfo(sycl::queue& queue, sycl::buffer& devInfo, + std::vector& dev_info_) { sycl::host_accessor dev_info_acc{ devInfo }; for (unsigned int i = 0; i < dev_info_.size(); ++i) dev_info_[i] = dev_info_acc[i]; } -inline void get_cusolver_devinfo(sycl::queue &queue, const int *devInfo, - std::vector &dev_info_) { +inline void get_cusolver_devinfo(sycl::queue& queue, const int* devInfo, + std::vector& dev_info_) { queue.wait(); queue.memcpy(dev_info_.data(), devInfo, sizeof(int)); } template -inline void lapack_info_check(sycl::queue &queue, DEVINFO_T devinfo, const char *func_name, - const char *cufunc_name, int dev_info_size = 1) { +inline void lapack_info_check(sycl::queue& queue, DEVINFO_T devinfo, const char* func_name, + const char* cufunc_name, int dev_info_size = 1) { std::vector dev_info_(dev_info_size); get_cusolver_devinfo(queue, devinfo, dev_info_); - for (const auto &val : dev_info_) { + for (const auto& val : dev_info_) { if (val > 0) throw oneapi::mkl::lapack::computation_error( func_name, std::string(cufunc_name) + " failed with info = " + std::to_string(val), @@ -311,8 +311,8 @@ inline void 
lapack_info_check(sycl::queue &queue, DEVINFO_T devinfo, const char // Creates list of matrix/vector pointers from initial ptr and stride // Note: user is responsible for deallocating memory template -T **create_ptr_list_from_stride(T *ptr, int64_t ptr_stride, int64_t batch_size) { - T **ptr_list = (T **)malloc(sizeof(T *) * batch_size); +T** create_ptr_list_from_stride(T* ptr, int64_t ptr_stride, int64_t batch_size) { + T** ptr_list = (T**)malloc(sizeof(T*) * batch_size); for (int64_t i = 0; i < batch_size; i++) ptr_list[i] = ptr + i * ptr_stride; diff --git a/src/lapack/backends/cusolver/cusolver_lapack.cpp b/src/lapack/backends/cusolver/cusolver_lapack.cpp index 0c7aaefc8..b60fec82e 100644 --- a/src/lapack/backends/cusolver/cusolver_lapack.cpp +++ b/src/lapack/backends/cusolver/cusolver_lapack.cpp @@ -30,10 +30,10 @@ namespace cusolver { // BUFFER APIs template -inline void gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void gebrd(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); @@ -41,21 +41,21 @@ inline void gebrd(const char *func_name, Func func, sycl::queue &queue, std::int if (m < n) throw unimplemented("lapack", "gebrd", "cusolver gebrd does not support m < n"); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto d_acc = d.template get_access(cgh); auto e_acc = e.template get_access(cgh); auto 
tauq_acc = tauq.template get_access(cgh); auto taup_acc = taup.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto d_ = sc.get_mem(d_acc); - auto e_ = sc.get_mem(e_acc); - auto tauq_ = sc.get_mem(tauq_acc); - auto taup_ = sc.get_mem(taup_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto d_ = sc.get_mem(d_acc); + auto e_ = sc.get_mem(e_acc); + auto tauq_ = sc.get_mem(tauq_acc); + auto taup_ = sc.get_mem(taup_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_, taup_, scratch_, scratchpad_size, nullptr); @@ -64,10 +64,10 @@ inline void gebrd(const char *func_name, Func func, sycl::queue &queue, std::int } #define GEBRD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, \ - sycl::buffer &tauq, sycl::buffer &taup, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, \ + sycl::buffer& tauq, sycl::buffer& taup, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ gebrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup, \ scratchpad, scratchpad_size); \ } @@ -79,43 +79,43 @@ GEBRD_LAUNCHER(std::complex, double, cusolverDnZgebrd) #undef GEBRD_LAUNCHER -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, 
sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } template -inline void geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void geqrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = 
tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -124,8 +124,8 @@ inline void geqrf(const char *func_name, Func func, sycl::queue &queue, std::int } #define GEQRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, \ + void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ geqrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad, \ scratchpad_size); \ @@ -139,9 +139,9 @@ GEQRF_LAUNCHER(std::complex, cusolverDnZgeqrf) #undef GEQRF_LAUNCHER template -void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void getrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); @@ -152,17 +152,17 @@ void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, sycl::buffer ipiv32(sycl::range<1>{ 
ipiv_size }); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv32_acc = ipiv32.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv32_ = sc.get_mem(ipiv32_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv32_ = sc.get_mem(ipiv32_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, scratch_, ipiv32_, devInfo_); @@ -170,7 +170,7 @@ void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, }); // Copy from 32-bit buffer to 64-bit - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { @@ -181,8 +181,8 @@ void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, } #define GETRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, \ + void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ getrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad, \ scratchpad_size); \ @@ -196,8 +196,8 @@ GETRF_LAUNCHER(std::complex, 
cusolverDnZgetrf) #undef GETRF_LAUNCHER #define GETRI_LAUNCHER(TYPE) \ - void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &ipiv, sycl::buffer &scratchpad, \ + void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& ipiv, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ return getri_batch(queue, n, a, lda, lda * n, ipiv, n, 1, scratchpad, scratchpad_size); \ } @@ -211,10 +211,10 @@ GETRI_LAUNCHER(std::complex) // cusolverDnXgetrs does not use scratchpad memory template -inline void getrs(const char *func_name, Func func, sycl::queue &queue, +inline void getrs(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, nrhs, lda, ldb); @@ -225,7 +225,7 @@ inline void getrs(const char *func_name, Func func, sycl::queue &queue, std::uint64_t ipiv_size = ipiv.size(); sycl::buffer ipiv32(sycl::range<1>{ ipiv_size }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { @@ -233,15 +233,15 @@ inline void getrs(const char *func_name, Func func, sycl::queue &queue, }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv_acc = ipiv32.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + 
onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv_ = sc.get_mem(ipiv_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv_ = sc.get_mem(ipiv_acc); + auto b_ = sc.get_mem(b_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), n, nrhs, a_, lda, ipiv_, b_, ldb, nullptr); @@ -250,10 +250,10 @@ inline void getrs(const char *func_name, Func func, sycl::queue &queue, } #define GETRS_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, \ - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, \ + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ getrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, ipiv, b, ldb, \ scratchpad, scratchpad_size); \ } @@ -266,30 +266,30 @@ GETRS_LAUNCHER(std::complex, cusolverDnZgetrs) #undef GETRS_LAUNCHER template -inline void gesvd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, sycl::buffer &scratchpad, +inline void gesvd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t 
scratchpad_size) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(n, m, lda, ldu, ldvt, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto s_acc = s.template get_access(cgh); auto u_acc = u.template get_access(cgh); auto vt_acc = vt.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto s_ = sc.get_mem(s_acc); - auto u_ = sc.get_mem(u_acc); - auto vt_ = sc.get_mem(vt_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto s_ = sc.get_mem(s_acc); + auto u_ = sc.get_mem(u_acc); + auto vt_ = sc.get_mem(vt_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; // rwork is set to nullptr. If set it is filled with information from the superdiagonal. 
CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_jobsvd(jobu), @@ -301,10 +301,10 @@ inline void gesvd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define GESVD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, \ - sycl::buffer &vt, std::int64_t ldvt, sycl::buffer &scratchpad, \ + void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, \ + sycl::buffer& vt, std::int64_t ldvt, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ gesvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, u, ldu, \ vt, ldvt, scratchpad, scratchpad_size); \ @@ -318,25 +318,25 @@ GESVD_LAUNCHER(std::complex, double, cusolverDnZgesvd) #undef GESVD_LAUNCHER template -inline void heevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +inline void heevd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template 
get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -347,9 +347,9 @@ inline void heevd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define HEEVD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ heevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad, \ scratchpad_size); \ } @@ -360,28 +360,28 @@ HEEVD_LAUNCHER(std::complex, double, cusolverDnZheevd) #undef HEEVD_LAUNCHER template -inline void hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, +inline void hegvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { 
using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype), get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_, @@ -392,10 +392,10 @@ inline void hegvd(const char *func_name, Func func, sycl::queue &queue, std::int } #define HEGVD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ hegvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, 
uplo, n, a, lda, b, ldb, w, \ scratchpad, scratchpad_size); \ } @@ -406,29 +406,29 @@ HEGVD_LAUNCHER(std::complex, double, cusolverDnZhegvd) #undef HEGVD_LAUNCHER template -inline void hetrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void hetrd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto d_acc = d.template get_access(cgh); auto e_acc = e.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto d_ = sc.get_mem(d_acc); - auto e_ = sc.get_mem(e_acc); - auto tau_ = sc.get_mem(tau_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto d_ = sc.get_mem(d_acc); + auto e_ = sc.get_mem(e_acc); + auto tau_ = sc.get_mem(tau_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_); 
@@ -438,10 +438,10 @@ inline void hetrd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define HETRD_LAUNCHER(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, \ - sycl::buffer &e, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, \ + sycl::buffer& e, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ hetrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, scratchpad, \ scratchpad_size); \ } @@ -451,34 +451,34 @@ HETRD_LAUNCHER(std::complex, double, cusolverDnZhetrd) #undef HETRD_LAUNCHER -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "hetrf"); } -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "hetrf"); } template -inline void orgbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void orgbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, 
+ std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -487,9 +487,9 @@ inline void orgbr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define ORGBR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ orgbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -500,20 +500,20 @@ ORGBR_LAUNCHER(double, cusolverDnDorgbr) #undef ORGBR_LAUNCHER template -inline void orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, 
std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void orgqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -522,9 +522,9 @@ inline void orgqr(const char *func_name, Func func, sycl::queue &queue, std::int } #define ORGQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ orgqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -535,20 +535,20 @@ ORGQR_LAUNCHER(double, cusolverDnDorgqr) #undef ORGQR_LAUNCHER 
template -inline void orgtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void orgtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -557,8 +557,8 @@ inline void orgtr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define ORGTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, \ + void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ orgtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \ scratchpad_size); \ @@ -570,24 +570,24 @@ 
ORGTR_LAUNCHER(double, cusolverDnDorgtr) #undef ORGTR_LAUNCHER template -inline void ormtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void ormtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_fill_mode(uplo), get_cublas_operation(trans), m, @@ -598,10 +598,10 @@ inline void ormtr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define ORMTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ ormtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, c, \ ldc, scratchpad, scratchpad_size); \ @@ -612,37 +612,37 @@ ORMTR_LAUNCHER(double, cusolverDnDormtr) #undef ORMTR_LAUNCHER -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "ormrq"); } -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "ormrq"); } template -inline void ormqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void ormqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, ldc, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc, @@ -652,10 +652,10 @@ inline void ormqr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define ORMQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, \ + std::int64_t lda, 
sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ ormqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c, \ ldc, scratchpad, scratchpad_size); \ } @@ -666,21 +666,21 @@ ORMQR_LAUNCHER(double, cusolverDnDormqr) #undef ORMQR_LAUNCHER template -inline void potrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +inline void potrf(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, scratch_, scratchpad_size, devInfo_); @@ -690,8 +690,8 @@ inline void potrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define POTRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void 
potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ potrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size); \ } @@ -704,21 +704,21 @@ POTRF_LAUNCHER(std::complex, cusolverDnZpotrf) #undef POTRF_LAUNCHER template -inline void potri(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +inline void potri(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, scratch_, scratchpad_size, devInfo_); @@ -728,8 +728,8 @@ inline void potri(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define POTRI_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t 
scratchpad_size) { \ + void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ potri(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size); \ } @@ -743,19 +743,19 @@ POTRI_LAUNCHER(std::complex, cusolverDnZpotri) // cusolverDnXpotrs does not use scratchpad memory template -inline void potrs(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +inline void potrs(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, nrhs, a_, lda, b_, ldb, nullptr); @@ -764,9 +764,9 @@ inline void potrs(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define POTRS_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, \ - sycl::buffer 
&scratchpad, std::int64_t scratchpad_size) { \ + void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ potrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb, \ scratchpad, scratchpad_size); \ } @@ -779,23 +779,23 @@ POTRS_LAUNCHER(std::complex, cusolverDnZpotrs) #undef POTRS_LAUNCHER template -inline void syevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void syevd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz), 
get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -806,9 +806,9 @@ inline void syevd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define SYEVD_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ syevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad, \ scratchpad_size); \ } @@ -819,26 +819,26 @@ SYEVD_LAUNCHER(double, cusolverDnDsyevd) #undef SYEVD_LAUNCHER template -inline void sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void sygvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + 
onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype), get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_, @@ -849,10 +849,10 @@ inline void sygvd(const char *func_name, Func func, sycl::queue &queue, std::int } #define SYGVD_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ sygvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, ldb, w, \ scratchpad, scratchpad_size); \ } @@ -863,28 +863,28 @@ SYGVD_LAUNCHER(double, cusolverDnDsygvd) #undef SYGVD_LAUNCH template -inline void sytrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void sytrd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto d_acc = d.template get_access(cgh); auto e_acc = e.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto d_ = sc.get_mem(d_acc); - auto e_ = sc.get_mem(e_acc); - auto tau_ = sc.get_mem(tau_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto d_ = sc.get_mem(d_acc); + auto e_ = sc.get_mem(e_acc); + auto tau_ = sc.get_mem(tau_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_); @@ -894,9 +894,9 @@ inline void sytrd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define SYTRD_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, \ - sycl::buffer &tau, sycl::buffer &scratchpad, \ + void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, \ + sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ sytrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, 
scratchpad, \ scratchpad_size); \ @@ -908,9 +908,9 @@ SYTRD_LAUNCHER(double, cusolverDnDsytrd) #undef SYTRD_LAUNCHER template -inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +inline void sytrf(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); @@ -922,17 +922,17 @@ inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: std::uint64_t ipiv_size = n; sycl::buffer ipiv32(sycl::range<1>{ ipiv_size }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv32_acc = ipiv32.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv32_ = sc.get_mem(ipiv32_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv32_ = sc.get_mem(ipiv32_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, ipiv32_, scratch_, scratchpad_size, devInfo_); @@ -940,7 +940,7 @@ inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: }); // Copy from 32-bit buffer to 64-bit - queue.submit([&](sycl::handler &cgh) { + 
queue.submit([&](sycl::handler& cgh) { auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { @@ -951,8 +951,8 @@ inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define SYTRF_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, \ + void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ sytrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv, scratchpad, \ scratchpad_size); \ @@ -965,49 +965,49 @@ SYTRF_LAUNCHER(std::complex, cusolverDnZsytrf) #undef SYTRF_LAUNCHER -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "trtrs"); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t 
scratchpad_size) { throw unimplemented("lapack", "trtrs"); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "trtrs"); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "trtrs"); } template -inline void ungbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void ungbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc 
= scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -1016,9 +1016,9 @@ inline void ungbr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNGBR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ ungbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -1029,20 +1029,20 @@ UNGBR_LAUNCHER(std::complex, cusolverDnZungbr) #undef UNGBR_LAUNCHER template -inline void ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void ungqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, 
lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -1051,9 +1051,9 @@ inline void ungqr(const char *func_name, Func func, sycl::queue &queue, std::int } #define UNGQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ ungqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -1064,20 +1064,20 @@ UNGQR_LAUNCHER(std::complex, cusolverDnZungqr) #undef UNGQR_LAUNCHER template -inline void ungtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void ungtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, 
std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -1086,8 +1086,8 @@ inline void ungtr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNGTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, \ + void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ ungtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \ scratchpad_size); \ @@ -1098,39 +1098,39 @@ UNGTR_LAUNCHER(std::complex, cusolverDnZungtr) #undef UNGTR_LAUNCHER -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + 
std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "unmrq"); } -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "unmrq"); } template -inline void unmqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void unmqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto 
tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc, @@ -1140,10 +1140,10 @@ inline void unmqr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNMQR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ unmqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c, \ ldc, scratchpad, scratchpad_size); \ } @@ -1154,24 +1154,24 @@ UNMQR_LAUNCHER(std::complex, cusolverDnZunmqr) #undef UNMQR_LAUNCHER template -inline void unmtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void unmtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using cuDataType = typename 
CudaEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); + auto scratch_ = sc.get_mem(scratch_acc); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_fill_mode(uplo), get_cublas_operation(trans), m, @@ -1182,10 +1182,10 @@ inline void unmtr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNMTR_LAUNCHER(TYPE, CUSOLVER_ROUTINE) \ - void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ unmtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, c, \ ldc, scratchpad, scratchpad_size); \ @@ -1199,10 +1199,10 @@ UNMTR_LAUNCHER(std::complex, cusolverDnZunmtr) // USM APIs template -inline sycl::event gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T_A *a, 
std::int64_t lda, T_B *d, T_B *e, T_A *tauq, - T_A *taup, T_A *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event gebrd(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T_A* a, std::int64_t lda, T_B* d, T_B* e, T_A* tauq, + T_A* taup, T_A* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); @@ -1210,19 +1210,19 @@ inline sycl::event gebrd(const char *func_name, Func func, sycl::queue &queue, s if (m < n) throw unimplemented("lapack", "gebrd", "cusolver gebrd does not support m < n"); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto d_ = reinterpret_cast(d); - auto e_ = reinterpret_cast(e); - auto tauq_ = reinterpret_cast(tauq); - auto taup_ = reinterpret_cast(taup); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto d_ = reinterpret_cast(d); + auto e_ = reinterpret_cast(e); + auto tauq_ = reinterpret_cast(tauq); + auto taup_ = reinterpret_cast(taup); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_, taup_, scratch_, scratchpad_size, nullptr); @@ -1232,10 +1232,10 @@ inline sycl::event gebrd(const char *func_name, Func func, sycl::queue &queue, s } #define GEBRD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - sycl::event gebrd(sycl::queue &queue, 
std::int64_t m, std::int64_t n, TYPE_A *a, \ - std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tauq, TYPE_A *taup, \ - TYPE_A *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, TYPE_A* a, \ + std::int64_t lda, TYPE_B* d, TYPE_B* e, TYPE_A* tauq, TYPE_A* taup, \ + TYPE_A* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return gebrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1247,44 +1247,44 @@ GEBRD_LAUNCHER_USM(std::complex, double, cusolverDnZgebrd) #undef GEBRD_LAUNCHER_USM -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, 
const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } template -inline sycl::event geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad, +inline sycl::event geqrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T* a, std::int64_t lda, T* tau, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -1294,9 +1294,9 @@ inline sycl::event geqrf(const char *func_name, Func func, sycl::queue &queue, s } #define 
GEQRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return geqrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1309,10 +1309,10 @@ GEQRF_LAUNCHER_USM(std::complex, cusolverDnZgeqrf) #undef GEQRF_LAUNCHER_USM template -inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad, +inline sycl::event getrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T* a, std::int64_t lda, std::int64_t* ipiv, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); @@ -1320,20 +1320,20 @@ inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, s // To get around the limitation. 
// Allocate memory with 32-bit ints then copy over results std::uint64_t ipiv_size = std::min(n, m); - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); - auto ipiv_ = reinterpret_cast(ipiv32); + auto a_ = reinterpret_cast(a); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); + auto ipiv_ = reinterpret_cast(ipiv32); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, scratch_, ipiv_, devInfo_); @@ -1341,7 +1341,7 @@ inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, s }); // Copy from 32-bit USM to 64-bit - auto done_casting = queue.submit([&](sycl::handler &cgh) { + auto done_casting = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv[index] = static_cast(ipiv32[index]); @@ -1358,10 +1358,10 @@ inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, s } #define GETRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad, \ + sycl::event getrf(sycl::queue& queue, std::int64_t m, 
std::int64_t n, TYPE* a, \ + std::int64_t lda, std::int64_t* ipiv, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return getrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1374,9 +1374,9 @@ GETRF_LAUNCHER_USM(std::complex, cusolverDnZgetrf) #undef GETRF_LAUNCHER_USM #define GETRI_LAUNCHER_USM(TYPE) \ - sycl::event getri(sycl::queue &queue, std::int64_t n, TYPE *a, std::int64_t lda, \ - std::int64_t *ipiv, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event getri(sycl::queue& queue, std::int64_t n, TYPE* a, std::int64_t lda, \ + std::int64_t* ipiv, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return getri_batch(queue, n, a, lda, lda * n, ipiv, n, 1, scratchpad, scratchpad_size, \ dependencies); \ } @@ -1390,11 +1390,11 @@ GETRI_LAUNCHER_USM(std::complex) // cusolverDnXgetrs does not use scratchpad memory template -inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T *a, - std::int64_t lda, std::int64_t *ipiv, T *b, std::int64_t ldb, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event getrs(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T* a, + std::int64_t lda, std::int64_t* ipiv, T* b, std::int64_t ldb, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, scratchpad_size); @@ -1402,25 +1402,25 @@ inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue, // To get around the limitation. // Create new buffer and convert 64-bit values. 
std::uint64_t ipiv_size = n; - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); - auto done_casting = queue.submit([&](sycl::handler &cgh) { + auto done_casting = queue.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv32[index] = static_cast(ipiv[index]); }); }); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } cgh.depends_on(done_casting); - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto ipiv_ = reinterpret_cast(ipiv32); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto ipiv_ = reinterpret_cast(ipiv32); + auto b_ = reinterpret_cast(b); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_operation(trans), n, nrhs, a_, lda, ipiv_, b_, ldb, nullptr); @@ -1435,10 +1435,10 @@ inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue, } #define GETRS_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, \ - std::int64_t nrhs, TYPE *a, std::int64_t lda, std::int64_t *ipiv, TYPE *b, \ - std::int64_t ldb, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, \ + std::int64_t nrhs, TYPE* a, std::int64_t lda, std::int64_t* ipiv, TYPE* b, \ + std::int64_t ldb, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return getrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, 
trans, n, nrhs, a, lda, ipiv, b, \ ldb, scratchpad, scratchpad_size, dependencies); \ } @@ -1451,28 +1451,28 @@ GETRS_LAUNCHER_USM(std::complex, cusolverDnZgetrs) #undef GETRS_LAUNCHER_USM template -inline sycl::event gesvd(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event gesvd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, T_A *a, std::int64_t lda, T_B *s, T_A *u, std::int64_t ldu, - T_A *vt, std::int64_t ldvt, T_A *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, T_A* a, std::int64_t lda, T_B* s, T_A* u, std::int64_t ldu, + T_A* vt, std::int64_t ldvt, T_A* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldu, ldvt, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto s_ = reinterpret_cast(s); - auto u_ = reinterpret_cast(u); - auto vt_ = reinterpret_cast(vt); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto s_ = reinterpret_cast(s); + auto u_ = reinterpret_cast(u); + auto vt_ = reinterpret_cast(vt); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; // 
rwork is set to nullptr. If set it is filled with information from the superdiagonal. CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_jobsvd(jobu), @@ -1486,11 +1486,11 @@ inline sycl::event gesvd(const char *func_name, Func func, sycl::queue &queue, } #define GESVD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ - std::int64_t m, std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *s, \ - TYPE_A *u, std::int64_t ldu, TYPE_A *vt, std::int64_t ldvt, \ - TYPE_A *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ + std::int64_t m, std::int64_t n, TYPE_A* a, std::int64_t lda, TYPE_B* s, \ + TYPE_A* u, std::int64_t ldu, TYPE_A* vt, std::int64_t ldvt, \ + TYPE_A* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return gesvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, u, \ ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); \ } @@ -1503,25 +1503,25 @@ GESVD_LAUNCHER_USM(std::complex, double, cusolverDnZgesvd) #undef GESVD_LAUNCHER_USM template -inline sycl::event heevd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a, - std::int64_t lda, T_B *&w, T_A *&scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event heevd(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A*& a, + std::int64_t lda, T_B*& w, T_A*& scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int 
*)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto w_ = reinterpret_cast(w); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto w_ = reinterpret_cast(w); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -1534,10 +1534,10 @@ inline sycl::event heevd(const char *func_name, Func func, sycl::queue &queue, } #define HEEVD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ - std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *w, TYPE_A *scratchpad, \ + sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ + std::int64_t n, TYPE_A* a, std::int64_t lda, TYPE_B* w, TYPE_A* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return heevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1548,27 +1548,27 @@ HEEVD_LAUNCHER_USM(std::complex, double, cusolverDnZheevd) #undef HEEVD_LAUNCHER_USM template -inline sycl::event hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, 
oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a, - std::int64_t lda, T_A *&b, std::int64_t ldb, T_B *&w, T_A *&scratchpad, +inline sycl::event hegvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A*& a, + std::int64_t lda, T_A*& b, std::int64_t ldb, T_B*& w, T_A*& scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto w_ = reinterpret_cast(w); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto w_ = reinterpret_cast(w); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype), get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_, @@ -1581,11 +1581,11 @@ inline sycl::event hegvd(const char *func_name, Func func, sycl::queue &queue, s } #define HEGVD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo 
uplo, std::int64_t n, TYPE_A *a, std::int64_t lda, \ - TYPE_A *b, std::int64_t ldb, TYPE_B *w, TYPE_A *scratchpad, \ + sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A* a, std::int64_t lda, \ + TYPE_A* b, std::int64_t ldb, TYPE_B* w, TYPE_A* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return hegvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, \ ldb, w, scratchpad, scratchpad_size, dependencies); \ } @@ -1596,27 +1596,27 @@ HEGVD_LAUNCHER_USM(std::complex, double, cusolverDnZhegvd) #undef HEGVD_LAUNCHER_USM template -inline sycl::event hetrd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T_A *a, std::int64_t lda, T_B *d, - T_B *e, T_A *tau, T_A *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event hetrd(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T_A* a, std::int64_t lda, T_B* d, + T_B* e, T_A* tau, T_A* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType_A = typename CudaEquivalentType::Type; using cuDataType_B = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto d_ = reinterpret_cast(d); - auto 
e_ = reinterpret_cast(e); - auto tau_ = reinterpret_cast(tau); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto d_ = reinterpret_cast(d); + auto e_ = reinterpret_cast(e); + auto tau_ = reinterpret_cast(tau); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_); @@ -1628,10 +1628,10 @@ inline sycl::event hetrd(const char *func_name, Func func, sycl::queue &queue, } #define HETRD_LAUNCHER_USM(TYPE_A, TYPE_B, CUSOLVER_ROUTINE) \ - sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A *a, \ - std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tau, TYPE_A *scratchpad, \ + sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A* a, \ + std::int64_t lda, TYPE_B* d, TYPE_B* e, TYPE_A* tau, TYPE_A* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return hetrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1641,37 +1641,37 @@ HETRD_LAUNCHER_USM(std::complex, double, cusolverDnZhetrd) #undef HETRD_LAUNCHER_USM -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "hetrf"); } -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t 
n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "hetrf"); } template -inline sycl::event orgbr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event orgbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - T *a, std::int64_t lda, T *tau, T *scratchpad, + T* a, std::int64_t lda, T* tau, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -1681,10 +1681,10 @@ inline sycl::event orgbr(const char *func_name, Func func, sycl::queue &queue, } #define ORGBR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event orgbr(sycl::queue &queue, 
oneapi::mkl::generate vec, std::int64_t m, \ - std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau, \ - TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, \ + std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, TYPE* tau, \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return orgbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1695,22 +1695,22 @@ ORGBR_LAUNCHER_USM(double, cusolverDnDorgbr) #undef ORGBR_LAUNCHER_USM template -inline sycl::event orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event orgqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; 
CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -1720,9 +1720,9 @@ inline sycl::event orgqr(const char *func_name, Func func, sycl::queue &queue, s } #define ORGQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return orgqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1733,22 +1733,22 @@ ORGQR_LAUNCHER_USM(double, cusolverDnDorgqr) #undef ORGQR_LAUNCHER_USM template -inline sycl::event orgtr(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event orgtr(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = 
reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -1758,9 +1758,9 @@ inline sycl::event orgtr(const char *func_name, Func func, sycl::queue &queue, } #define ORGTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return orgtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1771,25 +1771,25 @@ ORGTR_LAUNCHER_USM(double, cusolverDnDorgtr) #undef ORGTR_LAUNCHER_USM template -inline sycl::event ormtr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event ormtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a, - std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T* a, + std::int64_t lda, T* tau, T* c, std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); 
for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_fill_mode(uplo), get_cublas_operation(trans), m, @@ -1801,11 +1801,11 @@ inline sycl::event ormtr(const char *func_name, Func func, sycl::queue &queue, } #define ORMTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return ormtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, \ tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -1815,38 +1815,38 @@ ORMTR_LAUNCHER_USM(double, cusolverDnDormtr) #undef ORMTR_LAUNCHER_USM -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t 
scratchpad_size, const std::vector &dependencies) { +sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "ormrq"); } -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "ormrq"); } template -inline sycl::event ormqr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event ormqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, - std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, T* c, + std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, ldc, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, 
[=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc, @@ -1857,11 +1857,11 @@ inline sycl::event ormqr(const char *func_name, Func func, sycl::queue &queue, } #define ORMQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \ - TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, \ + TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return ormqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, \ tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -1872,23 +1872,23 @@ ORMQR_LAUNCHER_USM(double, cusolverDnDormqr) #undef ORMQR_LAUNCHER_USM template -inline sycl::event potrf(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event potrf(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, + T* 
scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, scratch_, scratchpad_size, devInfo_); @@ -1900,9 +1900,9 @@ inline sycl::event potrf(const char *func_name, Func func, sycl::queue &queue, } #define POTRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1915,23 +1915,23 @@ POTRF_LAUNCHER_USM(std::complex, cusolverDnZpotrf) #undef POTRF_LAUNCHER_USM template -inline sycl::event potri(const char *func_name, Func func, sycl::queue &queue, - 
oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event potri(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto scratch_ = reinterpret_cast(scratchpad); - auto devInfo_ = reinterpret_cast(devInfo); + auto a_ = reinterpret_cast(a); + auto scratch_ = reinterpret_cast(scratchpad); + auto devInfo_ = reinterpret_cast(devInfo); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, scratch_, scratchpad_size, devInfo_); @@ -1943,9 +1943,9 @@ inline sycl::event potri(const char *func_name, Func func, sycl::queue &queue, } #define POTRI_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return 
potri(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1959,22 +1959,22 @@ POTRI_LAUNCHER_USM(std::complex, cusolverDnZpotri) // cusolverDnXpotrs does not use scratchpad memory template -inline sycl::event potrs(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T *a, - std::int64_t lda, T *b, std::int64_t ldb, T *scratchpad, +inline sycl::event potrs(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T* a, + std::int64_t lda, T* b, std::int64_t ldb, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, nrhs, a_, lda, b_, ldb, nullptr); @@ -1984,10 +1984,10 @@ inline sycl::event potrs(const char *func_name, Func func, sycl::queue &queue, } #define POTRS_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, \ - std::int64_t nrhs, TYPE *a, std::int64_t lda, TYPE *b, std::int64_t ldb, \ - TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event 
potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, \ + std::int64_t nrhs, TYPE* a, std::int64_t lda, TYPE* b, std::int64_t ldb, \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potrs(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2000,24 +2000,24 @@ POTRS_LAUNCHER_USM(std::complex, cusolverDnZpotrs) #undef POTRS_LAUNCHER_USM template -inline sycl::event syevd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a, - std::int64_t lda, T *w, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event syevd(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T* a, + std::int64_t lda, T* w, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto w_ = reinterpret_cast(w); - auto scratch_ = reinterpret_cast(scratchpad); - auto devInfo_ = reinterpret_cast(devInfo); + auto a_ = reinterpret_cast(a); + auto w_ = reinterpret_cast(w); + auto scratch_ = reinterpret_cast(scratchpad); + auto devInfo_ = reinterpret_cast(devInfo); cusolverStatus_t err; 
CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -2030,10 +2030,10 @@ inline sycl::event syevd(const char *func_name, Func func, sycl::queue &queue, } #define SYEVD_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ - std::int64_t n, TYPE *a, std::int64_t lda, TYPE *w, TYPE *scratchpad, \ + sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ + std::int64_t n, TYPE* a, std::int64_t lda, TYPE* w, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return syevd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2044,26 +2044,26 @@ SYEVD_LAUNCHER_USM(double, cusolverDnDsyevd) #undef SYEVD_LAUNCHER_USM template -inline sycl::event sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a, - std::int64_t lda, T *b, std::int64_t ldb, T *w, T *scratchpad, +inline sycl::event sygvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T* a, + std::int64_t lda, T* b, std::int64_t ldb, T* w, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - 
onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto w_ = reinterpret_cast(w); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto w_ = reinterpret_cast(w); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cusolver_itype(itype), get_cusolver_job(jobz), get_cublas_fill_mode(uplo), n, a_, @@ -2076,10 +2076,10 @@ inline sycl::event sygvd(const char *func_name, Func func, sycl::queue &queue, s } #define SYGVD_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, std::int64_t lda, TYPE *b, \ - std::int64_t ldb, TYPE *w, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, std::int64_t lda, TYPE* b, \ + std::int64_t ldb, TYPE* w, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return sygvd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, \ ldb, w, scratchpad, scratchpad_size, dependencies); \ } @@ -2090,26 +2090,26 @@ SYGVD_LAUNCHER_USM(double, cusolverDnDsygvd) #undef SYGVD_LAUNCHER_USM template -inline sycl::event sytrd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *d, T *e, - T *tau, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event 
sytrd(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, T* d, T* e, + T* tau, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto d_ = reinterpret_cast(d); - auto e_ = reinterpret_cast(e); - auto tau_ = reinterpret_cast(tau); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto d_ = reinterpret_cast(d); + auto e_ = reinterpret_cast(e); + auto tau_ = reinterpret_cast(tau); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, d_, e_, tau_, scratch_, scratchpad_size, devInfo_); @@ -2121,10 +2121,10 @@ inline sycl::event sytrd(const char *func_name, Func func, sycl::queue &queue, } #define SYTRD_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *d, TYPE *e, TYPE *tau, TYPE *scratchpad, \ + sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* d, TYPE* e, TYPE* tau, TYPE* scratchpad, \ std::int64_t 
scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return sytrd(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2135,31 +2135,31 @@ SYTRD_LAUNCHER_USM(double, cusolverDnDsytrd) #undef SYTRD_LAUNCHER_USM template -inline sycl::event sytrf(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, - std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event sytrf(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, + std::int64_t* ipiv, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); + int* devInfo = (int*)malloc_device(sizeof(int), queue); // cuSolver legacy api does not accept 64-bit ints. // To get around the limitation. 
// Allocate memory with 32-bit ints then copy over results std::uint64_t ipiv_size = n; - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto scratch_ = reinterpret_cast(scratchpad); - auto ipiv_ = reinterpret_cast(ipiv32); - auto devInfo_ = reinterpret_cast(devInfo); + auto a_ = reinterpret_cast(a); + auto scratch_ = reinterpret_cast(scratchpad); + auto ipiv_ = reinterpret_cast(ipiv32); + auto devInfo_ = reinterpret_cast(devInfo); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, ipiv_, scratch_, scratchpad_size, devInfo_); @@ -2167,7 +2167,7 @@ inline sycl::event sytrf(const char *func_name, Func func, sycl::queue &queue, }); // Copy from 32-bit USM to 64-bit - auto done_casting = queue.submit([&](sycl::handler &cgh) { + auto done_casting = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv[index] = static_cast(ipiv32[index]); @@ -2184,10 +2184,10 @@ inline sycl::event sytrf(const char *func_name, Func func, sycl::queue &queue, } #define SYTRF_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad, \ + sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, std::int64_t* ipiv, TYPE* 
scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return sytrf(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2199,51 +2199,51 @@ SYTRF_LAUNCHER_USM(std::complex, cusolverDnZsytrf) #undef SYTRF_LAUNCHER_USM -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, double* b, std::int64_t ldb, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event 
trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } template -inline sycl::event ungbr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event ungbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - T *a, std::int64_t lda, T *tau, T *scratchpad, + T* a, std::int64_t lda, T* tau, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = 
reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_generate(vec), m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -2253,10 +2253,10 @@ inline sycl::event ungbr(const char *func_name, Func func, sycl::queue &queue, } #define UNGBR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, \ - std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau, \ - TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, \ + std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, TYPE* tau, \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return ungbr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2267,22 +2267,22 @@ UNGBR_LAUNCHER_USM(std::complex, cusolverDnZungbr) #undef UNGBR_LAUNCHER_USM template -inline sycl::event ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event ungqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { 
cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -2292,9 +2292,9 @@ inline sycl::event ungqr(const char *func_name, Func func, sycl::queue &queue, s } #define UNGQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return ungqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size, dependencies); \ } @@ -2305,22 +2305,22 @@ UNGQR_LAUNCHER_USM(std::complex, cusolverDnZungqr) #undef UNGQR_LAUNCHER_USM template -inline sycl::event ungtr(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event ungtr(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, 
scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, a_, lda, tau_, scratch_, scratchpad_size, nullptr); @@ -2330,9 +2330,9 @@ inline sycl::event ungtr(const char *func_name, Func func, sycl::queue &queue, } #define UNGTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return ungtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \ scratchpad_size, dependencies); \ } @@ -2342,40 +2342,40 @@ UNGTR_LAUNCHER_USM(std::complex, cusolverDnZungtr) #undef UNGTR_LAUNCHER_USM -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event 
unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "unmrq"); } -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "unmrq"); } template -inline sycl::event unmqr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event unmqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, - std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, T* c, + std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, 
queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_operation(trans), m, n, k, a_, lda, tau_, c_, ldc, @@ -2386,11 +2386,11 @@ inline sycl::event unmqr(const char *func_name, Func func, sycl::queue &queue, } #define UNMQR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \ - TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, \ + TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return unmqr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, \ tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -2401,25 +2401,25 @@ UNMQR_LAUNCHER_USM(std::complex, cusolverDnZunmqr) #undef UNMQR_LAUNCHER_USM template -inline sycl::event unmtr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event unmtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a, - std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, + 
oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T* a, + std::int64_t lda, T* tau, T* c, std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using cuDataType = typename CudaEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); + auto scratch_ = reinterpret_cast(scratchpad); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_cublas_side_mode(side), get_cublas_fill_mode(uplo), get_cublas_operation(trans), m, @@ -2431,11 +2431,11 @@ inline sycl::event unmtr(const char *func_name, Func func, sycl::queue &queue, } #define UNMTR_LAUNCHER_USM(TYPE, CUSOLVER_ROUTINE) \ - sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return 
unmtr(#CUSOLVER_ROUTINE, CUSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, \ tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -2448,11 +2448,11 @@ UNMTR_LAUNCHER_USM(std::complex, cusolverDnZunmtr) // SCRATCHPAD APIs template -inline void gebrd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void gebrd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, scratch_size); @@ -2479,32 +2479,32 @@ GEBRD_LAUNCHER_SCRATCH(std::complex, cusolverDnZgebrd_bufferSize) #undef GEBRD_LAUNCHER_SCRATCH template <> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "gerqf_scratchpad_size"); } template <> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "gerqf_scratchpad_size"); } template <> -std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gerqf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "gerqf_scratchpad_size"); } template <> -std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gerqf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw 
unimplemented("lapack", "gerqf_scratchpad_size"); } template -inline void geqrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void geqrf_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, nullptr, lda, scratch_size); @@ -2531,12 +2531,12 @@ GEQRF_LAUNCHER_SCRATCH(std::complex, cusolverDnZgeqrf_bufferSize) #undef GEQRF_LAUNCHER_SCRATCH template -inline void gesvd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void gesvd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, - std::int64_t ldu, std::int64_t ldvt, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t ldu, std::int64_t ldvt, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, scratch_size); @@ -2564,11 +2564,11 @@ GESVD_LAUNCHER_SCRATCH(std::complex, cusolverDnZgesvd_bufferSize) #undef GESVD_LAUNCHER_SCRATCH template -inline void getrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void getrf_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, - int 
*scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, nullptr, lda, scratch_size); @@ -2625,11 +2625,11 @@ GETRS_LAUNCHER_SCRATCH(std::complex) #undef GETRS_LAUNCHER_SCRATCH template -inline void heevd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void heevd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t lda, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t lda, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_job(jobz), @@ -2657,12 +2657,12 @@ HEEVD_LAUNCHER_SCRATCH(std::complex, cusolverDnZheevd_bufferSize) #undef HEEVD_LAUNCHER_SCRATCH template -inline void hegvd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void hegvd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; 
CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_itype(itype), @@ -2690,11 +2690,11 @@ HEGVD_LAUNCHER_SCRATCH(std::complex, cusolverDnZhegvd_bufferSize) #undef HEGVD_LAUNCHER_SCRATCH template -inline void hetrd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void hetrd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, @@ -2720,22 +2720,22 @@ HETRD_LAUNCHER_SCRATCH(std::complex, cusolverDnZhetrd_bufferSize) #undef HETRD_LAUNCHER_SCRATCH template <> -std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "hetrf_scratchpad_size"); } template <> -std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "hetrf_scratchpad_size"); } template -inline void orgbr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void orgbr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::int64_t lda, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t k, std::int64_t lda, int* 
scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_generate(vec), m, n, k, @@ -2762,11 +2762,11 @@ ORGBR_LAUNCHER_SCRATCH(double, cusolverDnDorgbr_bufferSize) #undef ORGBR_LAUNCHER_SCRATCH template -inline void orgtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void orgtr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, @@ -2792,11 +2792,11 @@ ORGTR_LAUNCHER_SCRATCH(double, cusolverDnDorgtr_bufferSize) #undef ORGTR_LAUNCHER_SCRATCH template -inline void orgqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void orgqr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, k, nullptr, lda, nullptr, @@ -2822,14 +2822,14 @@ ORGQR_LAUNCHER_SCRATCH(double, cusolverDnDorgqr_bufferSize) #undef ORGQR_LAUNCHER_SCRATCH template <> 
-std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { throw unimplemented("lapack", "ormrq_scratchpad_size"); } template <> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2837,12 +2837,12 @@ std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side } template -inline void ormqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void ormqr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, - std::int64_t ldc, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t ldc, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side), @@ -2870,12 +2870,12 @@ ORMQRF_LAUNCHER_SCRATCH(double, cusolverDnDormqr_bufferSize) #undef ORMQRF_LAUNCHER_SCRATCH template -inline void ormtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void ormtr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t lda, std::int64_t ldc, int *scratch_size) { - queue.submit([&](sycl::handler 
&cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t lda, std::int64_t ldc, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side), @@ -2904,11 +2904,11 @@ ORMTR_LAUNCHER_SCRATCH(double, cusolverDnDormtr_bufferSize) #undef ORMTR_LAUNCHER_SCRATCH template -inline void potrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void potrf_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, @@ -2952,11 +2952,11 @@ POTRS_LAUNCHER_SCRATCH(std::complex) #undef POTRS_LAUNCHER_SCRATCH template -inline void potri_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void potri_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, @@ -2984,11 +2984,11 @@ 
POTRI_LAUNCHER_SCRATCH(std::complex, cusolverDnZpotri_bufferSize) #undef POTRI_LAUNCHER_SCRATCH template -inline void sytrf_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void sytrf_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, n, nullptr, lda, scratch_size); @@ -3015,11 +3015,11 @@ SYTRF_LAUNCHER_SCRATCH(std::complex, cusolverDnZsytrf_bufferSize) #undef SYTRF_LAUNCHER_SCRATCH template -inline void syevd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void syevd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t lda, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t lda, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_job(jobz), @@ -3047,12 +3047,12 @@ SYEVD_LAUNCHER_SCRATCH(double, cusolverDnDsyevd_bufferSize) #undef SYEVD_LAUNCHER_SCRATCH template -inline void sygvd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void sygvd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t 
lda, std::int64_t ldb, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cusolver_itype(itype), @@ -3080,11 +3080,11 @@ SYGVD_LAUNCHER_SCRATCH(double, cusolverDnDsygvd_bufferSize) #undef SYGVD_LAUNCHER_SCRATCH template -inline void sytrd_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void sytrd_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, @@ -3110,21 +3110,21 @@ SYTRD_LAUNCHER_SCRATCH(double, cusolverDnDsytrd_bufferSize) #undef SYTRD_LAUNCHER_SCRATCH template <> -std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { throw unimplemented("lapack", "trtrs_scratchpad_size"); } template <> -std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t 
ldb) { throw unimplemented("lapack", "trtrs_scratchpad_size"); } template <> -std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, @@ -3132,7 +3132,7 @@ std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, onea throw unimplemented("lapack", "trtrs_scratchpad_size"); } template <> -std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, @@ -3141,11 +3141,11 @@ std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, one } template -inline void ungbr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void ungbr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::int64_t lda, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t k, std::int64_t lda, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_generate(vec), m, n, k, @@ -3172,11 +3172,11 @@ UNGBR_LAUNCHER_SCRATCH(std::complex, cusolverDnZungbr_bufferSize) #undef UNGBR_LAUNCHER_SCRATCH template -inline void ungqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void ungqr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, - int *scratch_size) { - 
queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, m, n, k, nullptr, lda, nullptr, @@ -3202,11 +3202,11 @@ UNGQR_LAUNCHER_SCRATCH(std::complex, cusolverDnZungqr_bufferSize) #undef UNGQR_LAUNCHER_SCRATCH template -inline void ungtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void ungtr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, - int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_fill_mode(uplo), n, @@ -3232,7 +3232,7 @@ UNGTR_LAUNCHER_SCRATCH(std::complex, cusolverDnZungtr_bufferSize) #undef UNGTR_LAUNCHER_SCRATCH template <> -std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -3240,7 +3240,7 @@ std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, onea throw unimplemented("lapack", "unmrq_scratchpad_size"); } template <> -std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, 
@@ -3249,12 +3249,12 @@ std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, one } template -inline void unmqr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void unmqr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, - std::int64_t ldc, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t ldc, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side), @@ -3282,12 +3282,12 @@ UNMQR_LAUNCHER_SCRATCH(std::complex, cusolverDnZunmqr_bufferSize) #undef UNMQR_LAUNCHER_SCRATCH template -inline void unmtr_scratchpad_size(const char *func_name, Func func, sycl::queue &queue, +inline void unmtr_scratchpad_size(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t lda, std::int64_t ldc, int *scratch_size) { - queue.submit([&](sycl::handler &cgh) { - onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler &sc) { + std::int64_t lda, std::int64_t ldc, int* scratch_size) { + queue.submit([&](sycl::handler& cgh) { + onemkl_cusolver_host_task(cgh, queue, [=](CusolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); cusolverStatus_t err; CUSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_cublas_side_mode(side), diff --git a/src/lapack/backends/cusolver/cusolver_scope_handle.cpp b/src/lapack/backends/cusolver/cusolver_scope_handle.cpp index a0c9c6b6f..a49febd20 100644 --- a/src/lapack/backends/cusolver/cusolver_scope_handle.cpp +++ 
b/src/lapack/backends/cusolver/cusolver_scope_handle.cpp @@ -44,7 +44,7 @@ thread_local cusolver_handle CusolverScopedContextHandler::handle_he #endif CusolverScopedContextHandler::CusolverScopedContextHandler(sycl::queue queue, - sycl::interop_handle &ih) + sycl::interop_handle& ih) : ih(ih), needToRecover_(false) { placedContext_ = new sycl::context(queue.get_context()); @@ -74,8 +74,8 @@ CusolverScopedContextHandler::~CusolverScopedContextHandler() noexcept(false) { delete placedContext_; } -void ContextCallback(void *userData) { - auto *ptr = static_cast *>(userData); +void ContextCallback(void* userData) { + auto* ptr = static_cast*>(userData); if (!ptr) { return; } @@ -93,7 +93,7 @@ void ContextCallback(void *userData) { } } -cusolverDnHandle_t CusolverScopedContextHandler::get_handle(const sycl::queue &queue) { +cusolverDnHandle_t CusolverScopedContextHandler::get_handle(const sycl::queue& queue) { auto cudaDevice = ih.get_native_device(); CUresult cuErr; CUcontext desired; @@ -140,10 +140,10 @@ cusolverDnHandle_t CusolverScopedContextHandler::get_handle(const sycl::queue &q return handle; } -CUstream CusolverScopedContextHandler::get_stream(const sycl::queue &queue) { +CUstream CusolverScopedContextHandler::get_stream(const sycl::queue& queue) { return sycl::get_native(queue); } -sycl::context CusolverScopedContextHandler::get_context(const sycl::queue &queue) { +sycl::context CusolverScopedContextHandler::get_context(const sycl::queue& queue) { return queue.get_context(); } diff --git a/src/lapack/backends/cusolver/cusolver_scope_handle.hpp b/src/lapack/backends/cusolver/cusolver_scope_handle.hpp index 390f9bb46..123562bf3 100644 --- a/src/lapack/backends/cusolver/cusolver_scope_handle.hpp +++ b/src/lapack/backends/cusolver/cusolver_scope_handle.hpp @@ -89,19 +89,19 @@ cuSolver handle to the SYCL context. 
class CusolverScopedContextHandler { CUcontext original_; - sycl::context *placedContext_; + sycl::context* placedContext_; bool needToRecover_; - sycl::interop_handle &ih; + sycl::interop_handle& ih; #ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED static thread_local cusolver_handle handle_helper; #else static thread_local cusolver_handle handle_helper; #endif - CUstream get_stream(const sycl::queue &queue); - sycl::context get_context(const sycl::queue &queue); + CUstream get_stream(const sycl::queue& queue); + sycl::context get_context(const sycl::queue& queue); public: - CusolverScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih); + CusolverScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih); ~CusolverScopedContextHandler() noexcept(false); /** @@ -111,7 +111,7 @@ class CusolverScopedContextHandler { * @param queue sycl queue. * @return cusolverDnHandle_t a handle to construct cusolver routines */ - cusolverDnHandle_t get_handle(const sycl::queue &queue); + cusolverDnHandle_t get_handle(const sycl::queue& queue); // This is a work-around function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. 
template @@ -120,7 +120,7 @@ class CusolverScopedContextHandler { return reinterpret_cast(cudaPtr); } - void wait_stream(const sycl::queue &queue) { + void wait_stream(const sycl::queue& queue) { cuStreamSynchronize(get_stream(queue)); } }; diff --git a/src/lapack/backends/cusolver/cusolver_task.hpp b/src/lapack/backends/cusolver/cusolver_task.hpp index 00e6e26be..8467007c9 100644 --- a/src/lapack/backends/cusolver/cusolver_task.hpp +++ b/src/lapack/backends/cusolver/cusolver_task.hpp @@ -49,7 +49,7 @@ namespace lapack { namespace cusolver { template -static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { +static inline void host_task_internal(H& cgh, sycl::queue queue, F f) { cgh.host_task([f, queue](sycl::interop_handle ih) { auto sc = CusolverScopedContextHandler(queue, ih); f(sc); @@ -58,7 +58,7 @@ static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { } template -static inline void onemkl_cusolver_host_task(H &cgh, sycl::queue queue, F f) { +static inline void onemkl_cusolver_host_task(H& cgh, sycl::queue queue, F f) { (void)host_task_internal(cgh, queue, f); } diff --git a/src/lapack/backends/mkl_common/mkl_lapack.cxx b/src/lapack/backends/mkl_common/mkl_lapack.cxx index 8573bffd9..055531f7c 100644 --- a/src/lapack/backends/mkl_common/mkl_lapack.cxx +++ b/src/lapack/backends/mkl_common/mkl_lapack.cxx @@ -17,1861 +17,1861 @@ * SPDX-License-Identifier: Apache-2.0 *******************************************************************************/ -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, sycl::buffer> &taup, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, sycl::buffer &scratchpad, +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, sycl::buffer &scratchpad, +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tauq, - sycl::buffer> &taup, sycl::buffer> &scratchpad, +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tauq, + sycl::buffer>& taup, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer 
&tau, sycl::buffer &scratchpad, +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } 
-void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, 
scratchpad_size); } -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + 
sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& 
u, std::int64_t ldu, sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, 
std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, 
ldb, w, scratchpad, scratchpad_size); } -void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::hetrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::hetrf(queue, 
uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t 
scratchpad_size) { ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormtr(sycl::queue& queue, 
oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer 
&scratchpad, std::int64_t scratchpad_size) { +void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potrf(sycl::queue &queue, 
oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) 
{ ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, 
+ sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, +void sygvd(sycl::queue& 
queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, +void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, +void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, +void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void sytrf(sycl::queue 
&queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - 
sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - 
sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, 
+void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t 
lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose 
trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, float *d, float *e, std::complex *tauq, - std::complex *taup, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, float* d, float* e, std::complex* tauq, + std::complex* taup, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *d, double *e, double *tauq, double *taup, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gebrd(sycl::queue& queue, std::int64_t m, 
std::int64_t n, double* a, std::int64_t lda, + double* d, double* e, double* tauq, double* taup, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *d, float *e, float *tauq, float *taup, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* d, float* e, float* tauq, float* taup, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, double *d, double *e, std::complex *tauq, - std::complex *taup, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, double* d, double* e, std::complex* tauq, + std::complex* taup, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gebrd(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, 
tau, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gerqf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, 
std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector 
&dependencies) { +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getri(sycl::queue &queue, std::int64_t 
n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs(sycl::queue &queue, 
oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t* ipiv, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t* ipiv, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, 
dependencies); } -sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, double *a, std::int64_t lda, double *s, double *u, - std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, double* a, std::int64_t lda, double* s, double* u, + std::int64_t ldu, double* vt, std::int64_t ldvt, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, float *a, std::int64_t lda, float *s, float *u, - std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, float* a, std::int64_t lda, float* s, float* u, + std::int64_t ldu, 
float* vt, std::int64_t ldvt, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, std::complex *a, std::int64_t lda, - float *s, std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, + float* s, std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, std::complex *a, std::int64_t lda, - double *s, std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, + double* s, std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::gesvd(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - 
std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::heevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, std::int64_t lda, - 
std::complex *b, std::int64_t ldb, double *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, double* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::hetrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::hetrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::hetrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, double* tau, double* scratchpad, + 
std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies) { +sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ormtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, 
std::int64_t lda, - float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ormrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, 
c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ormqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { 
+sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t 
scratchpad_size, const std::vector &dependencies) { +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::potri(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + float* a, std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + double* a, std::int64_t lda, double* b, std::int64_t ldb, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return 
::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *w, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* w, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } 
-sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *w, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* w, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::syevd(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *w, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* w, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *w, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* w, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrd(sycl::queue &queue, 
oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *d, double *e, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* d, double* e, double* tau, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *d, float *e, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* d, float* e, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::sytrd(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* 
ipiv, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::sytrf(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { 
return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, double* b, std::int64_t ldb, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies) { + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::trtrs(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungbr(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + 
std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungqr(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungtr(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, 
oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::unmrq(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, 
std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::unmqr(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side 
side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return ::oneapi::mkl::lapack::unmtr(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, - std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, + std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, - std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, + std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t 
stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, 
sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, sycl::buffer &b, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& 
scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> 
&a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer &scratchpad, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(sycl::queue &queue, std::int64_t m, 
std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, 
std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t 
scratchpad_size) { ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, 
nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& 
scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, - std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, + std::int64_t stride_tau, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, - std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, + std::int64_t stride_tau, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *tau, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *tau, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, float** tau, 
std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& 
queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::geqrf_batch(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad, +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad, +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getrf_batch(sycl::queue& queue, 
std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a, - 
std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrf_batch(queue, m, n, a, lda, ipiv, group_count, group_sizes, 
scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return 
::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return 
::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getri_batch(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, float *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, float *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, float* b, std::int64_t ldb, + std::int64_t stride_b, 
std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, double *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, double* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { 
return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, std::int64_t **ipiv, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, std::int64_t** ipiv, + float** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue 
&queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, std::int64_t **ipiv, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, std::int64_t** ipiv, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::getrs_batch(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, std::int64_t stride_a, float *tau, - std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad, +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + float* a, std::int64_t lda, std::int64_t stride_a, float* tau, + std::int64_t stride_tau, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, std::int64_t stride_a, double *tau, - std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad, +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + double* a, std::int64_t lda, std::int64_t stride_a, double* tau, + std::int64_t stride_tau, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, 
dependencies); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float **a, std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float** a, std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double **a, std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double** a, std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::orgqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, 
dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t group_count, 
std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t 
*group_sizes, std::complex *scratchpad, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrf_batch(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a, - float *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t stride_a, + float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a, - double *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t stride_a, + double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, 
n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies) { +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, 
group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return ::oneapi::mkl::lapack::potrs_batch(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& queue, 
std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return ::oneapi::mkl::lapack::ungqr_batch(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } template <> -std::int64_t gebrd_scratchpad_size(sycl::queue &queue, 
std::int64_t m, std::int64_t n, +std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gebrd_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gebrd_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t gebrd_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gebrd_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gebrd_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t gebrd_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gebrd_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gebrd_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gerqf_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gerqf_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gerqf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gerqf_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gerqf_scratchpad_size>(sycl::queue& 
queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::gerqf_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::geqrf_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::geqrf_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t geqrf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t geqrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::geqrf_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t geqrf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t geqrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::geqrf_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) { return ::oneapi::mkl::lapack::gesvd_scratchpad_size(queue, jobu, jobvt, m, n, lda, ldu, ldvt); } template <> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) { @@ -1879,7 +1879,7 @@ std::int64_t gesvd_scratchpad_size(sycl::queue &queue, 
oneapi::mkl::jobs ldvt); } template <> -std::int64_t gesvd_scratchpad_size>(sycl::queue &queue, +std::int64_t gesvd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, @@ -1888,7 +1888,7 @@ std::int64_t gesvd_scratchpad_size>(sycl::queue &queue, n, lda, ldu, ldvt); } template <> -std::int64_t gesvd_scratchpad_size>(sycl::queue &queue, +std::int64_t gesvd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, @@ -1897,57 +1897,57 @@ std::int64_t gesvd_scratchpad_size>(sycl::queue &queue, n, lda, ldu, ldvt); } template <> -std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::getrf_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::getrf_scratchpad_size(queue, m, n, lda); } template <> -std::int64_t getrf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t getrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::getrf_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t getrf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t getrf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::getrf_scratchpad_size>(queue, m, n, lda); } template <> -std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda) { +std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda) { return 
::oneapi::mkl::lapack::getri_scratchpad_size(queue, n, lda); } template <> -std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda) { +std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::getri_scratchpad_size(queue, n, lda); } template <> -std::int64_t getri_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::getri_scratchpad_size>(queue, n, lda); } template <> -std::int64_t getri_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::getri_scratchpad_size>(queue, n, lda); } template <> -std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::getrs_scratchpad_size(queue, trans, n, nrhs, lda, ldb); } template <> -std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::getrs_scratchpad_size(queue, trans, n, nrhs, lda, ldb); } template <> -std::int64_t getrs_scratchpad_size>(sycl::queue &queue, +std::int64_t getrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -1955,7 +1955,7 @@ std::int64_t getrs_scratchpad_size>(sycl::queue &queue, lda, ldb); } template <> -std::int64_t getrs_scratchpad_size>(sycl::queue &queue, +std::int64_t getrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::transpose trans, 
std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -1963,21 +1963,21 @@ std::int64_t getrs_scratchpad_size>(sycl::queue &queue, lda, ldb); } template <> -std::int64_t heevd_scratchpad_size>(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t heevd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::heevd_scratchpad_size>(queue, jobz, uplo, n, lda); } template <> -std::int64_t heevd_scratchpad_size>(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t heevd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::heevd_scratchpad_size>(queue, jobz, uplo, n, lda); } template <> -std::int64_t hegvd_scratchpad_size>(sycl::queue &queue, std::int64_t itype, +std::int64_t hegvd_scratchpad_size>(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { @@ -1985,7 +1985,7 @@ std::int64_t hegvd_scratchpad_size>(sycl::queue &queue, std: uplo, n, lda, ldb); } template <> -std::int64_t hegvd_scratchpad_size>(sycl::queue &queue, std::int64_t itype, +std::int64_t hegvd_scratchpad_size>(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { @@ -1993,59 +1993,59 @@ std::int64_t hegvd_scratchpad_size>(sycl::queue &queue, std uplo, n, lda, ldb); } template <> -std::int64_t hetrd_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::hetrd_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t hetrd_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrd_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo 
uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::hetrd_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::hetrf_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::hetrf_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, +std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return ::oneapi::mkl::lapack::orgbr_scratchpad_size(queue, vect, m, n, k, lda); } template <> -std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, +std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return ::oneapi::mkl::lapack::orgbr_scratchpad_size(queue, vect, m, n, k, lda); } template <> -std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::orgtr_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::orgtr_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t orgqr_scratchpad_size(sycl::queue &queue, 
std::int64_t m, std::int64_t n, +std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return ::oneapi::mkl::lapack::orgqr_scratchpad_size(queue, m, n, k, lda); } template <> -std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return ::oneapi::mkl::lapack::orgqr_scratchpad_size(queue, m, n, k, lda); } template <> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2053,7 +2053,7 @@ std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side ldc); } template <> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2061,7 +2061,7 @@ std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side ldc); } template <> -std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2069,7 +2069,7 @@ std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side ldc); } template <> -std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ 
-2077,7 +2077,7 @@ std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side ldc); } template <> -std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) { @@ -2085,7 +2085,7 @@ std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side ldc); } template <> -std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) { @@ -2093,129 +2093,129 @@ std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side ldc); } template <> -std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::potrf_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::potrf_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t potrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::potrf_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t potrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return 
::oneapi::mkl::lapack::potrf_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::potrs_scratchpad_size(queue, uplo, n, nrhs, lda, ldb); } template <> -std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::potrs_scratchpad_size(queue, uplo, n, nrhs, lda, ldb); } template <> -std::int64_t potrs_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::potrs_scratchpad_size>(queue, uplo, n, nrhs, lda, ldb); } template <> -std::int64_t potrs_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::potrs_scratchpad_size>(queue, uplo, n, nrhs, lda, ldb); } template <> -std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::potri_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::potri_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t 
potri_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potri_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::potri_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t potri_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potri_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::potri_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::sytrf_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::sytrf_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t sytrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t sytrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::sytrf_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t sytrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t sytrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::sytrf_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::syevd_scratchpad_size(queue, jobz, uplo, n, lda); 
} template <> -std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::syevd_scratchpad_size(queue, jobz, uplo, n, lda); } template <> -std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::sygvd_scratchpad_size(queue, itype, jobz, uplo, n, lda, ldb); } template <> -std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, +std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { return ::oneapi::mkl::lapack::sygvd_scratchpad_size(queue, itype, jobz, uplo, n, lda, ldb); } template <> -std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::sytrd_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::sytrd_scratchpad_size(queue, uplo, n, lda); } template <> -std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -2223,7 +2223,7 @@ std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo lda, 
ldb); } template <> -std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -2231,7 +2231,7 @@ std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo lda, ldb); } template <> -std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, @@ -2240,7 +2240,7 @@ std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, onea queue, uplo, trans, diag, n, nrhs, lda, ldb); } template <> -std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, @@ -2249,7 +2249,7 @@ std::int64_t trtrs_scratchpad_size>(sycl::queue &queue, one queue, uplo, trans, diag, n, nrhs, lda, ldb); } template <> -std::int64_t ungbr_scratchpad_size>(sycl::queue &queue, +std::int64_t ungbr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { @@ -2257,7 +2257,7 @@ std::int64_t ungbr_scratchpad_size>(sycl::queue &queue, lda); } template <> -std::int64_t ungbr_scratchpad_size>(sycl::queue &queue, +std::int64_t ungbr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { @@ -2265,29 +2265,29 @@ std::int64_t ungbr_scratchpad_size>(sycl::queue &queue, lda); } template <> -std::int64_t ungqr_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t ungqr_scratchpad_size>(sycl::queue& queue, 
std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return ::oneapi::mkl::lapack::ungqr_scratchpad_size>(queue, m, n, k, lda); } template <> -std::int64_t ungqr_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t ungqr_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return ::oneapi::mkl::lapack::ungqr_scratchpad_size>(queue, m, n, k, lda); } template <> -std::int64_t ungtr_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t ungtr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::ungtr_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t ungtr_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t ungtr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return ::oneapi::mkl::lapack::ungtr_scratchpad_size>(queue, uplo, n, lda); } template <> -std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2296,7 +2296,7 @@ std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, onea n, k, lda, ldc); } template <> -std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2305,7 +2305,7 @@ std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, one n, k, lda, ldc); } template <> -std::int64_t unmqr_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmqr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t 
k, std::int64_t lda, @@ -2314,7 +2314,7 @@ std::int64_t unmqr_scratchpad_size>(sycl::queue &queue, onea n, k, lda, ldc); } template <> -std::int64_t unmqr_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmqr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2323,7 +2323,7 @@ std::int64_t unmqr_scratchpad_size>(sycl::queue &queue, one n, k, lda, ldc); } template <> -std::int64_t unmtr_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmtr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, @@ -2332,7 +2332,7 @@ std::int64_t unmtr_scratchpad_size>(sycl::queue &queue, onea trans, m, n, lda, ldc); } template <> -std::int64_t unmtr_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmtr_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, @@ -2341,14 +2341,14 @@ std::int64_t unmtr_scratchpad_size>(sycl::queue &queue, one queue, side, uplo, trans, m, n, lda, ldc); } template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size(queue, m, n, lda, stride_a, stride_ipiv, batch_size); } template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { @@ -2356,7 +2356,7 @@ std::int64_t 
getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_ stride_ipiv, batch_size); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -2365,7 +2365,7 @@ std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue queue, m, n, lda, stride_a, stride_ipiv, batch_size); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -2374,14 +2374,14 @@ std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queu queue, m, n, lda, stride_a, stride_ipiv, batch_size); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return ::oneapi::mkl::lapack::getri_batch_scratchpad_size(queue, n, lda, stride_a, stride_ipiv, batch_size); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { @@ -2389,7 +2389,7 @@ std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_ stride_ipiv, batch_size); } template <> -std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -2398,7 +2398,7 @@ std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue queue, n, lda, stride_a, stride_ipiv, 
batch_size); } template <> -std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -2407,7 +2407,7 @@ std::int64_t getri_batch_scratchpad_size>(sycl::queue &queu queue, n, lda, stride_a, stride_ipiv, batch_size); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, @@ -2416,7 +2416,7 @@ std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl: queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, @@ -2426,7 +2426,7 @@ std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size>( @@ -2434,28 +2434,28 @@ std::int64_t getrs_batch_scratchpad_size>( } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose 
trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size>( queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size(queue, m, n, lda, stride_a, stride_tau, batch_size); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size(queue, m, n, lda, stride_a, stride_tau, batch_size); } template <> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, @@ -2464,7 +2464,7 @@ std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue queue, m, n, lda, stride_a, stride_tau, batch_size); } template <> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, @@ -2473,21 +2473,21 @@ std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queu queue, m, n, lda, stride_a, stride_tau, batch_size); } template <> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, 
oneapi::mkl::uplo uplo, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size(queue, uplo, n, lda, stride_a, batch_size); } template <> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size(queue, uplo, n, lda, stride_a, batch_size); } template <> -std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue, +std::int64_t potrf_batch_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, @@ -2496,7 +2496,7 @@ std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue queue, uplo, n, lda, stride_a, batch_size); } template <> -std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue, +std::int64_t potrf_batch_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, @@ -2505,7 +2505,7 @@ std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queu queue, uplo, n, lda, stride_a, batch_size); } template <> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { @@ -2513,7 +2513,7 @@ std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl: queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size); } template <> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t 
potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, @@ -2523,20 +2523,20 @@ std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl } template <> std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size>( queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size); } template <> std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size>( queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { @@ -2544,7 +2544,7 @@ std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t stride_tau, batch_size); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { @@ -2553,241 +2553,241 @@ 
std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_ } template <> std::int64_t ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size>( queue, m, n, k, lda, stride_a, stride_tau, batch_size); } template <> std::int64_t ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size>( queue, m, n, k, lda, stride_a, stride_tau, batch_size); } template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, 
std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size>( queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrf_batch_scratchpad_size>( queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getri_batch_scratchpad_size(queue, n, lda, group_count, group_sizes); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getri_batch_scratchpad_size(queue, n, lda, group_count, group_sizes); } template <> -std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getri_batch_scratchpad_size>( queue, n, lda, group_count, group_sizes); } template <> -std::int64_t 
getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getri_batch_scratchpad_size>( queue, n, lda, group_count, group_sizes); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size(queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size( queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) { + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size>( 
queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) { + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::getrs_batch_scratchpad_size>( queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size>( queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue 
&queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::geqrf_batch_scratchpad_size>( queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::int64_t *lda, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::orgqr_batch_scratchpad_size(queue, m, n, k, lda, group_count, group_sizes); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::orgqr_batch_scratchpad_size(queue, m, n, k, lda, group_count, group_sizes); } template <> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size(queue, uplo, n, lda, group_count, group_sizes); } template <> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +std::int64_t 
potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size(queue, uplo, n, lda, group_count, group_sizes); } template <> -std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue, - oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +std::int64_t potrf_batch_scratchpad_size>(sycl::queue& queue, + oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size>( queue, uplo, n, lda, group_count, group_sizes); } template <> -std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue, - oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, +std::int64_t potrf_batch_scratchpad_size>(sycl::queue& queue, + oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrf_batch_scratchpad_size>( queue, uplo, n, lda, group_count, group_sizes); } template <> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size(queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +std::int64_t 
potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size( queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) { + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size>( queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) { + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::potrs_batch_scratchpad_size>( queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> -std::int64_t ungqr_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, +std::int64_t ungqr_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size>( queue, m, n, k, lda, group_count, group_sizes); } template <> -std::int64_t ungqr_batch_scratchpad_size>(sycl::queue &queue, std::int64_t 
*m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, +std::int64_t ungqr_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return ::oneapi::mkl::lapack::ungqr_batch_scratchpad_size>( queue, m, n, k, lda, group_count, group_sizes); } diff --git a/src/lapack/backends/mkl_common/mkl_lapack_backend.hpp b/src/lapack/backends/mkl_common/mkl_lapack_backend.hpp index 1932bb959..d0ba37e7a 100644 --- a/src/lapack/backends/mkl_common/mkl_lapack_backend.hpp +++ b/src/lapack/backends/mkl_common/mkl_lapack_backend.hpp @@ -28,1235 +28,1235 @@ namespace oneapi { namespace mkl { namespace lapack { -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, sycl::buffer> &taup, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, sycl::buffer &scratchpad, +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, sycl::buffer>& taup, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, sycl::buffer &scratchpad, +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, 
sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tauq, - sycl::buffer> &taup, sycl::buffer> &scratchpad, +void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tauq, + sycl::buffer>& taup, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - 
std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getrf(sycl::queue &queue, std::int64_t m, 
std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void getrs(sycl::queue &queue, 
oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, 
sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer &u, std::int64_t ldu, sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void hetrd(sycl::queue 
&queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer& u, std::int64_t ldu, sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t 
scratchpad_size); +void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, 
std::int64_t scratchpad_size); -void orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void orgtr(sycl::queue& 
queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t 
lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t 
nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, +void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void ormqr(sycl::queue& queue, oneapi::mkl::side 
side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, 
sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &w, sycl::buffer &scratchpad, +void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, 
std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, +void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t 
scratchpad_size); +void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, 
oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, 
std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmqr(sycl::queue& queue, 
oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, +void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, - std::int64_t stride_tau, 
std::int64_t batch_size, sycl::buffer &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, + std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, - std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, + std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, 
+void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size); -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, sycl::buffer &b, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, 
std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t 
n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer &scratchpad, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrf_batch(sycl::queue& 
queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, +void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, 
std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, 
std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t 
batch_size, sycl::buffer> &scratchpad, +void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, float *d, float *e, std::complex *tauq, - std::complex *taup, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *d, double *e, double *tauq, double *taup, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *d, float *e, float *tauq, float *taup, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, double *d, double *e, std::complex *tauq, - std::complex *taup, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gerqf(sycl::queue 
&queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, 
std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t *ipiv, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, 
std::int64_t n, double *a, std::int64_t lda, double *s, double *u, - std::int64_t ldu, double *vt, std::int64_t ldvt, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, float *a, std::int64_t lda, float *s, float *u, - std::int64_t ldu, float *vt, std::int64_t ldvt, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, std::complex *a, std::int64_t lda, - float *s, std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, std::complex *a, std::int64_t lda, - double *s, std::complex *u, std::int64_t ldu, std::complex *vt, - std::int64_t ldvt, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, float *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies = {}); -sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, double *w, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, double *a, - std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies = {}); -sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, float *a, - std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose 
trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event potri(sycl::queue 
&queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - float *a, std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - double *a, std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *w, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *w, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *w, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event sygvd(sycl::queue 
&queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *w, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *d, double *e, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *d, float *e, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event trtrs(sycl::queue &queue, 
oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, float* d, float* e, std::complex* tauq, + std::complex* taup, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* d, double* e, double* tauq, double* taup, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* d, float* e, float* tauq, float* taup, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, double* d, double* e, std::complex* tauq, + std::complex* taup, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* tau, 
double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t 
scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t* ipiv, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t* ipiv, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrs(sycl::queue& queue, 
oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, double* a, std::int64_t lda, double* s, double* u, + std::int64_t ldu, double* vt, std::int64_t ldvt, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, float* a, std::int64_t lda, float* s, float* u, + std::int64_t ldu, float* vt, std::int64_t ldvt, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, + float* s, std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, std::complex* a, std::int64_t lda, + double* s, std::complex* u, std::int64_t ldu, std::complex* vt, + std::int64_t ldvt, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* w, + std::complex* 
scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, float* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, double* w, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t 
m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, double* tau, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event ormrq(sycl::queue& 
queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, 
+ std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + float* a, std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + double* a, std::int64_t lda, double* b, std::int64_t ldb, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* w, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* w, float* scratchpad, + std::int64_t scratchpad_size, 
const std::vector& dependencies = {}); +sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* w, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* w, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* d, double* e, double* tau, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* d, float* e, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* ipiv, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event 
trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, double* b, std::int64_t ldb, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies = {}); -sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + 
std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - 
std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, - std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, + std::int64_t stride_tau, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, - std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, + std::int64_t stride_tau, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *tau, + const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t 
stride_a, std::complex *tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, float** 
tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, 
std::int64_t *ipiv, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const 
std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, 
std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, float *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + 
std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, float* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, double* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t 
stride_ipiv, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, std::int64_t **ipiv, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, std::int64_t **ipiv, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); 
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, std::int64_t stride_a, float *tau, - std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, std::int64_t** ipiv, + float** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, std::int64_t** ipiv, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + float* a, std::int64_t lda, std::int64_t stride_a, float* tau, + std::int64_t stride_tau, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); 
-sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, std::int64_t stride_a, double *tau, - std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies = {}); +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + double* a, std::int64_t lda, std::int64_t stride_a, double* tau, + std::int64_t stride_tau, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float **a, std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double **a, std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, + const std::vector& dependencies = {}); +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float** a, std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double** a, std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, 
std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, std::complex *scratchpad, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, std::complex *scratchpad, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, double **a, - 
std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a, - float *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a, - double *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t stride_a, + float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t stride_a, + double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& 
queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, double **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, double** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& queue, 
oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies = {}); +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - 
std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies = {}); + const std::vector& dependencies = {}); +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies = {}); template = nullptr> -std::int64_t gebrd_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gebrd_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t 
geqrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template = nullptr> -std::int64_t gesvd_scratchpad_size(sycl::queue &queue, oneapi::mkl::jobsvd jobu, +std::int64_t gesvd_scratchpad_size(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); template = nullptr> -std::int64_t getrf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda); +std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t getrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, +std::int64_t getrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t heevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t heevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t hegvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +std::int64_t hegvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, 
std::int64_t ldb); template = nullptr> -std::int64_t hetrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t hetrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t hetrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t hetrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t orgbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, +std::int64_t orgbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t orgtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t orgtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t orgqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t ormqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t ormtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, 
oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t potrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t potrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrs_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t potri_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potri_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t sytrf_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t sytrf_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t syevd_scratchpad_size(sycl::queue &queue, oneapi::mkl::job jobz, +std::int64_t syevd_scratchpad_size(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t sygvd_scratchpad_size(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, +std::int64_t sygvd_scratchpad_size(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t sytrd_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t sytrd_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t trtrs_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t trtrs_scratchpad_size(sycl::queue& 
queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); template = nullptr> -std::int64_t ungbr_scratchpad_size(sycl::queue &queue, oneapi::mkl::generate vect, std::int64_t m, +std::int64_t ungbr_scratchpad_size(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ungqr_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t ungqr_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); template = nullptr> -std::int64_t ungtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t ungtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); template = nullptr> -std::int64_t unmrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t unmqr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmqr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t unmtr_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmtr_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); template = nullptr> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, 
std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template = nullptr> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda, +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); template = nullptr> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); template = nullptr> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t 
batch_size); template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); template = nullptr> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, 
std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); template = nullptr> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes); +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes); template = nullptr> -std::int64_t ungqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes); +std::int64_t ungqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes); } // namespace lapack } // namespace mkl diff --git a/src/lapack/backends/rocsolver/rocsolver_batch.cpp b/src/lapack/backends/rocsolver/rocsolver_batch.cpp index 0b4b877e8..f970c76dd 100644 --- a/src/lapack/backends/rocsolver/rocsolver_batch.cpp +++ b/src/lapack/backends/rocsolver/rocsolver_batch.cpp @@ -31,476 +31,476 @@ namespace rocsolver { // BATCH BUFFER API -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t 
stride_a, sycl::buffer &tau, - std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, + std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "geqrf_batch"); } -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &tau, - std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& tau, + std::int64_t stride_tau, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "geqrf_batch"); } -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "geqrf_batch"); } -void geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw 
unimplemented("lapack", "geqrf_batch"); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getri_batch"); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getri_batch"); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getri_batch"); } -void getri_batch(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getri_batch(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw 
unimplemented("lapack", "getri_batch"); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, sycl::buffer &b, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrs_batch"); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrs_batch"); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, 
+ sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrs_batch"); } -void getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrs_batch"); } -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer &scratchpad, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrf_batch"); } -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrf_batch"); } -void 
getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrf_batch"); } -void getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getrf_batch"); } -void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "orgqr_batch"); } -void orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr_batch(sycl::queue& 
queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "orgqr_batch"); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrf_batch"); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrf_batch"); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrf_batch"); } -void potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t 
batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrf_batch"); } -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrs_batch"); } -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrs_batch"); } -void potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrs_batch"); } -void 
potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "potrs_batch"); } -void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "ungqr_batch"); } -void ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "ungqr_batch"); } // BATCH USM API -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, float *tau, - std::int64_t stride_tau, std::int64_t batch_size, float 
*scratchpad, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, float* tau, + std::int64_t stride_tau, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, double *tau, - std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, double* tau, + std::int64_t stride_tau, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *tau, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *tau, +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event geqrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event geqrf_batch(sycl::queue 
&queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "geqrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad, +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad, +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getrf_batch(sycl::queue& queue, std::int64_t 
m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getrf_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, 
std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getrf_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrf_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw 
unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, +sycl::event getri_batch(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getri_batch"); } -sycl::event getri_batch(sycl::queue &queue, std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getri_batch(sycl::queue& queue, std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw 
unimplemented("lapack", "getri_batch"); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, float *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, float *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, float* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, double *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, double* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, 
std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, float **a, std::int64_t *lda, std::int64_t **ipiv, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, float** a, std::int64_t* lda, std::int64_t** ipiv, + float** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event getrs_batch(sycl::queue &queue, 
oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, double **a, std::int64_t *lda, std::int64_t **ipiv, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, double** a, std::int64_t* lda, std::int64_t** ipiv, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event getrs_batch(sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, 
std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getrs_batch"); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, std::int64_t stride_a, float *tau, - std::int64_t stride_tau, std::int64_t batch_size, float *scratchpad, +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + float* a, std::int64_t lda, std::int64_t stride_a, float* tau, + std::int64_t stride_tau, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "orgqr_batch"); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, std::int64_t stride_a, double *tau, - std::int64_t stride_tau, std::int64_t batch_size, double *scratchpad, +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + double* a, std::int64_t lda, std::int64_t stride_a, double* tau, + std::int64_t stride_tau, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "orgqr_batch"); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - float **a, std::int64_t *lda, float **tau, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + float** a, std::int64_t* lda, float** tau, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const 
std::vector& dependencies) { throw unimplemented("lapack", "orgqr_batch"); } -sycl::event orgqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - double **a, std::int64_t *lda, double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + double** a, std::int64_t* lda, double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "orgqr_batch"); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, float *a, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "potrf_batch"); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, double *a, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "potrf_batch"); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t 
stride_a, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "potrf_batch"); } -sycl::event potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event potrf_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { throw unimplemented("lapack", "potrf_batch"); } template -inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, T **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, +inline sycl::event potrf_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, T** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; int64_t batch_size = 0; @@ -509,24 +509,24 @@ inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &qu batch_size += group_sizes[i]; } - int *info = (int *)malloc_device(sizeof(int) * batch_size, queue); - T **a_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue); + int* info = (int*)malloc_device(sizeof(int) * batch_size, queue); + T** a_dev = (T**)malloc_device(sizeof(T*) * batch_size, queue); auto done_cpy = - queue.submit([&](sycl::handler &h) { h.memcpy(a_dev, a, batch_size * sizeof(T *)); }); + queue.submit([&](sycl::handler& h) { h.memcpy(a_dev, a, batch_size * 
sizeof(T*)); }); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } cgh.depends_on(done_cpy); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a_dev); - auto *info_ = reinterpret_cast(info); + auto** a_ = reinterpret_cast(a_dev); + auto* info_ = reinterpret_cast(info); ROCSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_rocblas_fill_mode(uplo[i]), (int)n[i], a_ + offset, (int)lda[i], info_ + offset, (int)group_sizes[i]); @@ -540,9 +540,9 @@ inline sycl::event potrf_batch(const char *func_name, Func func, sycl::queue &qu // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRF_BATCH_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ sycl::event potrf_batch( \ - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, TYPE **a, std::int64_t *lda, \ - std::int64_t group_count, std::int64_t *group_sizes, TYPE *scratchpad, \ - std::int64_t scratchpad_size, const std::vector &dependencies) { \ + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, TYPE** a, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes, TYPE* scratchpad, \ + std::int64_t scratchpad_size, const std::vector& dependencies) { \ return potrf_batch(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, \ group_count, group_sizes, scratchpad, scratchpad_size, dependencies); \ } @@ -554,44 +554,44 @@ POTRF_BATCH_LAUNCHER_USM(std::complex, rocsolver_zpotrf_batched) #undef POTRF_BATCH_LAUNCHER_USM -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t 
nrhs, float *a, std::int64_t lda, std::int64_t stride_a, - float *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t stride_a, + float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "potrs_batch"); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, std::int64_t stride_a, - double *b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t stride_a, + double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "potrs_batch"); } -sycl::event potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "potrs_batch"); } -sycl::event 
potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, +sycl::event potrs_batch(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "potrs_batch"); } template -inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, T **a, - std::int64_t *lda, T **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, T *scratchpad, +inline sycl::event potrs_batch(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, T** a, + std::int64_t* lda, T** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; int64_t batch_size = 0; @@ -605,28 +605,28 @@ inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &qu "rocsolver potrs_batch only supports nrhs = 1"); } - T **a_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue); - T **b_dev = (T **)malloc_device(sizeof(T *) * batch_size, queue); + T** a_dev = (T**)malloc_device(sizeof(T*) * batch_size, queue); + T** b_dev = (T**)malloc_device(sizeof(T*) * batch_size, queue); auto done_cpy_a = - queue.submit([&](sycl::handler &h) { h.memcpy(a_dev, a, batch_size * sizeof(T *)); }); + queue.submit([&](sycl::handler& h) { h.memcpy(a_dev, 
a, batch_size * sizeof(T*)); }); auto done_cpy_b = - queue.submit([&](sycl::handler &h) { h.memcpy(b_dev, b, batch_size * sizeof(T *)); }); + queue.submit([&](sycl::handler& h) { h.memcpy(b_dev, b, batch_size * sizeof(T*)); }); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } cgh.depends_on(done_cpy_a); cgh.depends_on(done_cpy_b); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); int64_t offset = 0; rocblas_status err; for (int64_t i = 0; i < group_count; i++) { - auto **a_ = reinterpret_cast(a_dev); - auto **b_ = reinterpret_cast(b_dev); + auto** a_ = reinterpret_cast(a_dev); + auto** b_ = reinterpret_cast(b_dev); ROCSOLVER_ERROR_FUNC_T(func_name, func, err, handle, get_rocblas_fill_mode(uplo[i]), (int)n[i], (int)nrhs[i], a_ + offset, (int)lda[i], b_ + offset, (int)ldb[i], (int)group_sizes[i]); @@ -640,10 +640,10 @@ inline sycl::event potrs_batch(const char *func_name, Func func, sycl::queue &qu // Scratchpad memory not needed as parts of buffer a is used as workspace memory #define POTRS_BATCH_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ sycl::event potrs_batch( \ - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, \ - TYPE **a, std::int64_t *lda, TYPE **b, std::int64_t *ldb, std::int64_t group_count, \ - std::int64_t *group_sizes, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, \ + TYPE** a, std::int64_t* lda, TYPE** b, std::int64_t* ldb, std::int64_t group_count, \ + std::int64_t* group_sizes, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return 
potrs_batch(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, \ ldb, group_count, group_sizes, scratchpad, scratchpad_size, \ dependencies); \ @@ -656,52 +656,52 @@ POTRS_BATCH_LAUNCHER_USM(std::complex, rocsolver_zpotrs_batched) #undef POTRS_BATCH_LAUNCHER_USM -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "ungqr_batch"); } -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::complex *tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "ungqr_batch"); } -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& 
queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "ungqr_batch"); } -sycl::event ungqr_batch(sycl::queue &queue, std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::complex **a, std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event ungqr_batch(sycl::queue& queue, std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::complex** a, std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "ungqr_batch"); } // BATCH SCRATCHPAD API template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -709,7 +709,7 @@ std::int64_t 
getrf_batch_scratchpad_size>(sycl::queue &queue throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -717,20 +717,20 @@ std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queu throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { throw unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { throw unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -738,7 +738,7 @@ std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue throw unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -746,7 +746,7 @@ std::int64_t getri_batch_scratchpad_size>(sycl::queue &queu throw unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, 
oneapi::mkl::transpose trans, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, @@ -754,7 +754,7 @@ std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl: throw unimplemented("lapack", "getrs_batch_scratchpad_size"); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose trans, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, @@ -763,32 +763,32 @@ std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("lapack", "getrs_batch_scratchpad_size"); } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("lapack", "getrs_batch_scratchpad_size"); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { throw 
unimplemented("lapack", "geqrf_batch_scratchpad_size"); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { throw unimplemented("lapack", "geqrf_batch_scratchpad_size"); } template <> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, @@ -796,7 +796,7 @@ std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue throw unimplemented("lapack", "geqrf_batch_scratchpad_size"); } template <> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, @@ -805,19 +805,19 @@ std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queu } template <> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { throw unimplemented("lapack", "potrf_batch_scratchpad_size"); } template <> -std::int64_t potrf_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrf_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { throw unimplemented("lapack", "potrf_batch_scratchpad_size"); } template <> -std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue, +std::int64_t potrf_batch_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, 
std::int64_t lda, std::int64_t stride_a, @@ -825,7 +825,7 @@ std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue throw unimplemented("lapack", "potrf_batch_scratchpad_size"); } template <> -std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queue, +std::int64_t potrf_batch_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, @@ -833,14 +833,14 @@ std::int64_t potrf_batch_scratchpad_size>(sycl::queue &queu throw unimplemented("lapack", "potrf_batch_scratchpad_size"); } template <> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("lapack", "potrs_batch_scratchpad_size"); } template <> -std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t potrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, @@ -849,25 +849,25 @@ std::int64_t potrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl } template <> std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("lapack", "potrs_batch_scratchpad_size"); } template <> std::int64_t potrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t 
lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { throw unimplemented("lapack", "potrs_batch_scratchpad_size"); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { throw unimplemented("lapack", "orgqr_batch_scratchpad_size"); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { @@ -875,148 +875,148 @@ std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_ } template <> std::int64_t ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { throw unimplemented("lapack", "ungqr_batch_scratchpad_size"); } template <> std::int64_t ungqr_batch_scratchpad_size>( - sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { throw unimplemented("lapack", "ungqr_batch_scratchpad_size"); } template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* 
group_sizes) { throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "getrf_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { throw unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getri_batch_scratchpad_size(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { throw 
unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getri_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, +std::int64_t getri_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "getri_batch_scratchpad_size"); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "getrs_batch_scratchpad_size"); } template <> -std::int64_t getrs_batch_scratchpad_size(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, +std::int64_t getrs_batch_scratchpad_size(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "getrs_batch_scratchpad_size"); } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, 
std::int64_t group_count, std::int64_t *group_sizes) { + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes) { throw unimplemented("lapack", "getrs_batch_scratchpad_size"); } template <> std::int64_t getrs_batch_scratchpad_size>( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes) { + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes) { throw unimplemented("lapack", "getrs_batch_scratchpad_size"); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "geqrf_batch_scratchpad_size"); } template <> -std::int64_t geqrf_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "geqrf_batch_scratchpad_size"); } template <> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "geqrf_batch_scratchpad_size"); } template 
<> -std::int64_t geqrf_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "geqrf_batch_scratchpad_size"); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::int64_t *lda, +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "orgqr_batch_scratchpad_size"); } template <> -std::int64_t orgqr_batch_scratchpad_size(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t orgqr_batch_scratchpad_size(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { throw unimplemented("lapack", "orgqr_batch_scratchpad_size"); } // rocsolverDnXpotrfBatched does not use scratchpad memory -#define POTRF_GROUP_LAUNCHER_SCRATCH(TYPE) \ - template <> \ - std::int64_t potrf_batch_scratchpad_size( \ - sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * lda, \ - std::int64_t group_count, std::int64_t * group_sizes) { \ - return 0; \ +#define POTRF_GROUP_LAUNCHER_SCRATCH(TYPE) \ + template <> \ + std::int64_t potrf_batch_scratchpad_size( \ + sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t* n, std::int64_t* lda, \ + std::int64_t group_count, std::int64_t* group_sizes) { \ + return 0; \ } POTRF_GROUP_LAUNCHER_SCRATCH(float) @@ -1027,13 +1027,13 @@ POTRF_GROUP_LAUNCHER_SCRATCH(std::complex) #undef 
POTRF_GROUP_LAUNCHER_SCRATCH // rocsolverDnXpotrsBatched does not use scratchpad memory -#define POTRS_GROUP_LAUNCHER_SCRATCH(TYPE) \ - template <> \ - std::int64_t potrs_batch_scratchpad_size( \ - sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t * n, std::int64_t * nrhs, \ - std::int64_t * lda, std::int64_t * ldb, std::int64_t group_count, \ - std::int64_t * group_sizes) { \ - return 0; \ +#define POTRS_GROUP_LAUNCHER_SCRATCH(TYPE) \ + template <> \ + std::int64_t potrs_batch_scratchpad_size( \ + sycl::queue & queue, oneapi::mkl::uplo * uplo, std::int64_t* n, std::int64_t* nrhs, \ + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, \ + std::int64_t* group_sizes) { \ + return 0; \ } POTRS_GROUP_LAUNCHER_SCRATCH(float) @@ -1044,19 +1044,19 @@ POTRS_GROUP_LAUNCHER_SCRATCH(std::complex) #undef POTRS_GROUP_LAUNCHER_SCRATCH template <> -std::int64_t ungqr_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, +std::int64_t ungqr_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "ungqr_batch_scratchpad_size"); } template <> -std::int64_t ungqr_batch_scratchpad_size>(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, +std::int64_t ungqr_batch_scratchpad_size>(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { throw unimplemented("lapack", "ungqr_batch_scratchpad_size"); } diff --git a/src/lapack/backends/rocsolver/rocsolver_handle.hpp b/src/lapack/backends/rocsolver/rocsolver_handle.hpp index c44463ef4..fff7d591c 100644 --- a/src/lapack/backends/rocsolver/rocsolver_handle.hpp +++ b/src/lapack/backends/rocsolver/rocsolver_handle.hpp @@ 
-30,10 +30,10 @@ namespace rocsolver { template struct rocsolver_handle { - using handle_container_t = std::unordered_map *>; + using handle_container_t = std::unordered_map*>; handle_container_t rocsolver_handle_mapper_{}; ~rocsolver_handle() noexcept(false) { - for (auto &handle_pair : rocsolver_handle_mapper_) { + for (auto& handle_pair : rocsolver_handle_mapper_) { rocblas_status err; if (handle_pair.second != nullptr) { auto handle = handle_pair.second->exchange(nullptr); diff --git a/src/lapack/backends/rocsolver/rocsolver_helper.hpp b/src/lapack/backends/rocsolver/rocsolver_helper.hpp index dade1df64..d3eb06432 100644 --- a/src/lapack/backends/rocsolver/rocsolver_helper.hpp +++ b/src/lapack/backends/rocsolver/rocsolver_helper.hpp @@ -81,7 +81,7 @@ void overflow_check(Index index, Next... indices) { class rocsolver_error : virtual public std::runtime_error { protected: - inline const char *rocsolver_error_map(rocblas_status error) { + inline const char* rocsolver_error_map(rocblas_status error) { return rocblas_status_to_string(error); } @@ -111,7 +111,7 @@ class rocsolver_error : virtual public std::runtime_error { class hip_error : virtual public std::runtime_error { protected: - inline const char *hip_error_map(hipError_t result) { + inline const char* hip_error_map(hipError_t result) { return hipGetErrorName(result); } int error_number; ///< error number @@ -247,12 +247,12 @@ struct RocmEquivalentType> { /* devinfo */ -inline int get_rocsolver_devinfo(sycl::queue &queue, sycl::buffer &devInfo) { +inline int get_rocsolver_devinfo(sycl::queue& queue, sycl::buffer& devInfo) { sycl::host_accessor dev_info_{ devInfo }; return dev_info_[0]; } -inline int get_rocsolver_devinfo(sycl::queue &queue, const int *devInfo) { +inline int get_rocsolver_devinfo(sycl::queue& queue, const int* devInfo) { int dev_info_; queue.memcpy(&dev_info_, devInfo, sizeof(int)); queue.wait(); @@ -260,8 +260,8 @@ inline int get_rocsolver_devinfo(sycl::queue &queue, const int *devInfo) { 
} template -inline void lapack_info_check(sycl::queue &queue, DEVINFO_T devinfo, const char *func_name, - const char *cufunc_name) { +inline void lapack_info_check(sycl::queue& queue, DEVINFO_T devinfo, const char* func_name, + const char* cufunc_name) { queue.wait(); const int devinfo_ = get_rocsolver_devinfo(queue, devinfo); if (devinfo_ > 0) diff --git a/src/lapack/backends/rocsolver/rocsolver_lapack.cpp b/src/lapack/backends/rocsolver/rocsolver_lapack.cpp index e5e634ad0..42894af63 100644 --- a/src/lapack/backends/rocsolver/rocsolver_lapack.cpp +++ b/src/lapack/backends/rocsolver/rocsolver_lapack.cpp @@ -32,27 +32,27 @@ namespace rocsolver { // BUFFER APIs template -inline void gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void gebrd(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto d_acc = d.template get_access(cgh); auto e_acc = e.template get_access(cgh); auto tauq_acc = tauq.template get_access(cgh); auto taup_acc = taup.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto d_ = sc.get_mem(d_acc); - auto e_ = sc.get_mem(e_acc); - auto tauq_ 
= sc.get_mem(tauq_acc); - auto taup_ = sc.get_mem(taup_acc); + auto a_ = sc.get_mem(a_acc); + auto d_ = sc.get_mem(d_acc); + auto e_ = sc.get_mem(e_acc); + auto tauq_ = sc.get_mem(tauq_acc); + auto taup_ = sc.get_mem(taup_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_, taup_); @@ -61,10 +61,10 @@ inline void gebrd(const char *func_name, Func func, sycl::queue &queue, std::int } #define GEBRD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - void gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, \ - sycl::buffer &tauq, sycl::buffer &taup, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, \ + sycl::buffer& tauq, sycl::buffer& taup, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ gebrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup, \ scratchpad, scratchpad_size); \ } @@ -76,41 +76,41 @@ GEBRD_LAUNCHER(std::complex, double, rocsolver_zgebrd) #undef GEBRD_LAUNCHER -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } -void gerqf(sycl::queue &queue, std::int64_t m, 
std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } -void gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "gerqf"); } template -inline void geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void geqrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_); }); @@ -118,8 +118,8 @@ inline void geqrf(const char *func_name, Func func, sycl::queue &queue, std::int } 
#define GEQRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, \ + void geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ geqrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad, \ scratchpad_size); \ @@ -133,9 +133,9 @@ GEQRF_LAUNCHER(std::complex, rocsolver_zgeqrf) #undef GEQRF_LAUNCHER template -void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void getrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); @@ -146,15 +146,15 @@ void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, sycl::buffer ipiv32(sycl::range<1>{ ipiv_size }); sycl::buffer devInfo{ 1 }; - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv32_acc = ipiv32.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv32_ = sc.get_mem(ipiv32_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv32_ = sc.get_mem(ipiv32_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); rocblas_status err; 
ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, ipiv32_, devInfo_); @@ -162,7 +162,7 @@ void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, }); // Copy from 32-bit buffer to 64-bit - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); @@ -174,8 +174,8 @@ void getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, } #define GETRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, \ + void getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ getrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad, \ scratchpad_size); \ @@ -188,32 +188,32 @@ GETRF_LAUNCHER(std::complex, rocsolver_zgetrf) #undef GETRF_LAUNCHER -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getri"); } -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getri"); } -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(sycl::queue& queue, 
std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getri"); } -void getri(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getri(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "getri"); } template -inline void getrs(const char *func_name, Func func, sycl::queue &queue, +inline void getrs(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, nrhs, lda, ldb); @@ -224,7 +224,7 @@ inline void getrs(const char *func_name, Func func, sycl::queue &queue, std::uint64_t ipiv_size = ipiv.size(); sycl::buffer ipiv32(sycl::range<1>{ ipiv_size }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { @@ -232,15 +232,15 @@ inline void getrs(const char *func_name, Func func, sycl::queue &queue, }); }); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv_acc = ipiv32.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, 
queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv_ = sc.get_mem(ipiv_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv_ = sc.get_mem(ipiv_acc); + auto b_ = sc.get_mem(b_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_operation(trans), n, nrhs, a_, lda, ipiv_, b_, ldb); @@ -249,10 +249,10 @@ inline void getrs(const char *func_name, Func func, sycl::queue &queue, } #define GETRS_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, \ - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, \ + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ getrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, ipiv, b, ldb, \ scratchpad, scratchpad_size); \ } @@ -265,30 +265,30 @@ GETRS_LAUNCHER(std::complex, rocsolver_zgetrs) #undef GETRS_LAUNCHER template -inline void gesvd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, sycl::buffer &scratchpad, +inline void gesvd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType_A = 
typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(n, m, lda, ldu, ldvt, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto s_acc = s.template get_access(cgh); auto u_acc = u.template get_access(cgh); auto vt_acc = vt.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto s_ = sc.get_mem(s_acc); - auto u_ = sc.get_mem(u_acc); - auto vt_ = sc.get_mem(vt_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto s_ = sc.get_mem(s_acc); + auto u_ = sc.get_mem(u_acc); + auto vt_ = sc.get_mem(vt_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_jobsvd(jobu), get_rocsolver_jobsvd(jobvt), m, n, a_, lda, s_, u_, ldu, @@ -300,10 +300,10 @@ inline void gesvd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define GESVD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - void gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, \ - sycl::buffer &vt, std::int64_t ldvt, sycl::buffer &scratchpad, \ + void gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, \ + sycl::buffer& 
vt, std::int64_t ldvt, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ gesvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, u, ldu, \ vt, ldvt, scratchpad, scratchpad_size); \ @@ -317,25 +317,25 @@ GESVD_LAUNCHER(std::complex, double, rocsolver_zgesvd) #undef GESVD_LAUNCHER template -inline void heevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +inline void heevd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -346,9 +346,9 @@ inline void heevd(const char *func_name, Func func, sycl::queue &queue, 
oneapi:: } #define HEEVD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - void heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ heevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad, \ scratchpad_size); \ } @@ -359,28 +359,28 @@ HEEVD_LAUNCHER(std::complex, double, rocsolver_zheevd) #undef HEEVD_LAUNCHER template -inline void hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, +inline void hegvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = 
sc.get_mem(b_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype), get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, @@ -391,10 +391,10 @@ inline void hegvd(const char *func_name, Func func, sycl::queue &queue, std::int } #define HEGVD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - void hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ hegvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, ldb, \ w, scratchpad, scratchpad_size); \ } @@ -405,24 +405,24 @@ HEGVD_LAUNCHER(std::complex, double, rocsolver_zhegvd) #undef HEGVD_LAUNCHER template -inline void hetrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void hetrd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType_A = typename RocmEquivalentType::Type; using 
rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto d_acc = d.template get_access(cgh); auto e_acc = e.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto d_ = sc.get_mem(d_acc); - auto e_ = sc.get_mem(e_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto d_ = sc.get_mem(d_acc); + auto e_ = sc.get_mem(e_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, d_, e_, tau_); @@ -431,10 +431,10 @@ inline void hetrd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define HETRD_LAUNCHER(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - void hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, \ - sycl::buffer &e, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, \ + sycl::buffer& e, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ hetrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \ scratchpad, scratchpad_size); \ } @@ -444,32 +444,32 @@ HETRD_LAUNCHER(std::complex, double, rocsolver_zhetrd) #undef HETRD_LAUNCHER -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrf(sycl::queue& 
queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "hetrf"); } -void hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "hetrf"); } template -inline void orgbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void orgbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m, n, k, a_, lda, tau_); @@ -478,9 +478,9 @@ inline void orgbr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define ORGBR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void 
orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ orgbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -491,18 +491,18 @@ ORGBR_LAUNCHER(double, rocsolver_dorgbr) #undef ORGBR_LAUNCHER template -inline void orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void orgqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_); }); @@ -510,9 +510,9 @@ inline void orgqr(const char *func_name, Func func, sycl::queue &queue, std::int } #define ORGQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ 
- void orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ orgqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -523,18 +523,18 @@ ORGQR_LAUNCHER(double, rocsolver_dorgqr) #undef ORGQR_LAUNCHER template -inline void orgtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void orgtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, tau_); @@ -543,8 +543,8 @@ inline void orgtr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define ORGTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void orgtr(sycl::queue &queue, oneapi::mkl::uplo 
uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, \ + void orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ orgtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \ scratchpad_size); \ @@ -556,22 +556,22 @@ ORGTR_LAUNCHER(double, rocsolver_dorgtr) #undef ORGTR_LAUNCHER template -inline void ormtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void ormtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_fill_mode(uplo), get_rocblas_operation(trans), @@ -581,10 +581,10 @@ inline void ormtr(const char 
*func_name, Func func, sycl::queue &queue, oneapi:: } #define ORMTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + void ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ ormtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, \ c, ldc, scratchpad, scratchpad_size); \ @@ -595,35 +595,35 @@ ORMTR_LAUNCHER(double, rocsolver_dormtr) #undef ORMTR_LAUNCHER -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "ormrq"); } -void ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, + sycl::buffer& scratchpad, 
std::int64_t scratchpad_size) { throw unimplemented("lapack", "ormrq"); } template -inline void ormqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void ormqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, ldc, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_, @@ -633,10 +633,10 @@ inline void ormqr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define ORMQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void 
ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ ormqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c, \ ldc, scratchpad, scratchpad_size); \ } @@ -647,19 +647,19 @@ ORMQR_LAUNCHER(double, rocsolver_dormqr) #undef ORMQR_LAUNCHER template -inline void potrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +inline void potrf(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); + auto a_ = sc.get_mem(a_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, devInfo_); @@ -669,8 +669,8 @@ inline void potrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define POTRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { 
\ + void potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ potrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size); \ } @@ -683,19 +683,19 @@ POTRF_LAUNCHER(std::complex, rocsolver_zpotrf) #undef POTRF_LAUNCHER template -inline void potri(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +inline void potri(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); + auto a_ = sc.get_mem(a_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, devInfo_); @@ -705,8 +705,8 @@ inline void potri(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define POTRI_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& 
scratchpad, std::int64_t scratchpad_size) { \ potri(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size); \ } @@ -719,19 +719,19 @@ POTRI_LAUNCHER(std::complex, rocsolver_zpotri) #undef POTRI_LAUNCHER template -inline void potrs(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, +inline void potrs(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = sc.get_mem(b_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, nrhs, a_, lda, b_, ldb); @@ -740,9 +740,9 @@ inline void potrs(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define POTRS_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, \ + sycl::buffer& a, std::int64_t lda, 
sycl::buffer& b, std::int64_t ldb, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ potrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb, \ scratchpad, scratchpad_size); \ } @@ -755,23 +755,23 @@ POTRS_LAUNCHER(std::complex, rocsolver_zpotrs) #undef POTRS_LAUNCHER template -inline void syevd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void syevd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -782,9 +782,9 @@ inline void syevd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define 
SYEVD_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ syevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, scratchpad, \ scratchpad_size); \ } @@ -795,26 +795,26 @@ SYEVD_LAUNCHER(double, rocsolver_dsyevd) #undef SYEVD_LAUNCHER template -inline void sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void sygvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); sycl::buffer devInfo{ 1 }; - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto b_acc = b.template get_access(cgh); auto w_acc = w.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); auto scratch_acc = scratchpad.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto b_ = 
sc.get_mem(b_acc); - auto w_ = sc.get_mem(w_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); - auto scratch_ = sc.get_mem(scratch_acc); + auto a_ = sc.get_mem(a_acc); + auto b_ = sc.get_mem(b_acc); + auto w_ = sc.get_mem(w_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); + auto scratch_ = sc.get_mem(scratch_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype), get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, @@ -825,10 +825,10 @@ inline void sygvd(const char *func_name, Func func, sycl::queue &queue, std::int } #define SYGVD_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, \ - sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, \ + sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ sygvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, b, ldb, \ w, scratchpad, scratchpad_size); \ } @@ -839,23 +839,23 @@ SYGVD_LAUNCHER(double, rocsolver_dsygvd) #undef SYGVD_LAUNCH template -inline void sytrd(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void sytrd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); 
- queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto d_acc = d.template get_access(cgh); auto e_acc = e.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto d_ = sc.get_mem(d_acc); - auto e_ = sc.get_mem(e_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto d_ = sc.get_mem(d_acc); + auto e_ = sc.get_mem(e_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, d_, e_, tau_); @@ -864,9 +864,9 @@ inline void sytrd(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define SYTRD_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, \ - sycl::buffer &tau, sycl::buffer &scratchpad, \ + void sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, \ + sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ sytrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \ scratchpad, scratchpad_size); \ @@ -878,9 +878,9 @@ SYTRD_LAUNCHER(double, rocsolver_dsytrd) #undef SYTRD_LAUNCHER template -inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &scratchpad, +inline void sytrf(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, 
sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); @@ -892,15 +892,15 @@ inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: std::uint64_t ipiv_size = n; sycl::buffer ipiv32(sycl::range<1>{ ipiv_size }); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto ipiv32_acc = ipiv32.template get_access(cgh); auto devInfo_acc = devInfo.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto ipiv32_ = sc.get_mem(ipiv32_acc); - auto devInfo_ = sc.get_mem(devInfo_acc); + auto a_ = sc.get_mem(a_acc); + auto ipiv32_ = sc.get_mem(ipiv32_acc); + auto devInfo_ = sc.get_mem(devInfo_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, ipiv32_, devInfo_); @@ -908,7 +908,7 @@ inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: }); // Copy from 32-bit buffer to 64-bit - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); auto ipiv32_acc = ipiv32.template get_access(cgh); auto ipiv_acc = ipiv.template get_access(cgh); @@ -920,8 +920,8 @@ inline void sytrf(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define SYTRF_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, \ + void sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, \ std::int64_t 
scratchpad_size) { \ sytrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv, scratchpad, \ scratchpad_size); \ @@ -934,47 +934,47 @@ SYTRF_LAUNCHER(std::complex, rocsolver_zsytrf) #undef SYTRF_LAUNCHER -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "trtrs"); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "trtrs"); } -void trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "trtrs"); } -void trtrs(sycl::queue &queue, 
oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +void trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "trtrs"); } template -inline void ungbr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +inline void ungbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m, n, k, a_, lda, tau_); @@ -983,9 +983,9 @@ inline void ungbr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNGBR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ 
- std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, \ + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ ungbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -996,18 +996,18 @@ UNGBR_LAUNCHER(std::complex, rocsolver_zungbr) #undef UNGBR_LAUNCHER template -inline void ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void ungqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_); }); @@ -1015,9 +1015,9 @@ inline void ungqr(const char *func_name, Func func, sycl::queue &queue, std::int } #define UNGQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t 
k, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ ungqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, scratchpad, \ scratchpad_size); \ } @@ -1028,18 +1028,18 @@ UNGQR_LAUNCHER(std::complex, rocsolver_zungqr) #undef UNGQR_LAUNCHER template -inline void ungtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +inline void ungtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, tau_); @@ -1048,8 +1048,8 @@ inline void ungtr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNGTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, \ - std::int64_t lda, 
sycl::buffer &tau, sycl::buffer &scratchpad, \ + void ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ ungtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, scratchpad, \ scratchpad_size); \ @@ -1060,37 +1060,37 @@ UNGTR_LAUNCHER(std::complex, rocsolver_zungtr) #undef UNGTR_LAUNCHER -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "unmrq"); } -void unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, +void unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { throw unimplemented("lapack", "unmrq"); } template -inline void unmqr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void unmqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, 
sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_, @@ -1100,10 +1100,10 @@ inline void unmqr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNMQR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, \ - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, std::int64_t ldc, \ - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { \ + void unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, \ + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, std::int64_t ldc, \ + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { \ unmqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, tau, c, \ ldc, scratchpad, scratchpad_size); \ } @@ -1114,22 +1114,22 @@ 
UNMQR_LAUNCHER(std::complex, rocsolver_zunmqr) #undef UNMQR_LAUNCHER template -inline void unmtr(const char *func_name, Func func, sycl::queue &queue, oneapi::mkl::side side, +inline void unmtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + std::int64_t n, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - queue.submit([&](sycl::handler &cgh) { + queue.submit([&](sycl::handler& cgh) { auto a_acc = a.template get_access(cgh); auto tau_acc = tau.template get_access(cgh); auto c_acc = c.template get_access(cgh); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = sc.get_mem(a_acc); - auto tau_ = sc.get_mem(tau_acc); - auto c_ = sc.get_mem(c_acc); + auto a_ = sc.get_mem(a_acc); + auto tau_ = sc.get_mem(tau_acc); + auto c_ = sc.get_mem(c_acc); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_fill_mode(uplo), get_rocblas_operation(trans), @@ -1139,10 +1139,10 @@ inline void unmtr(const char *func_name, Func func, sycl::queue &queue, oneapi:: } #define UNMTR_LAUNCHER(TYPE, ROCSOLVER_ROUTINE) \ - void unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + void unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, \ - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, \ - sycl::buffer &c, std::int64_t ldc, 
sycl::buffer &scratchpad, \ + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, \ + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, \ std::int64_t scratchpad_size) { \ unmtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, lda, tau, \ c, ldc, scratchpad, scratchpad_size); \ @@ -1156,26 +1156,26 @@ UNMTR_LAUNCHER(std::complex, rocsolver_zunmtr) // USM APIs template -inline sycl::event gebrd(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T_A *a, std::int64_t lda, T_B *d, T_B *e, T_A *tauq, - T_A *taup, T_A *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event gebrd(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T_A* a, std::int64_t lda, T_B* d, T_B* e, T_A* tauq, + T_A* taup, T_A* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto d_ = reinterpret_cast(d); - auto e_ = reinterpret_cast(e); - auto tauq_ = reinterpret_cast(tauq); - auto taup_ = reinterpret_cast(taup); + auto a_ = reinterpret_cast(a); + auto d_ = reinterpret_cast(d); + auto e_ = reinterpret_cast(e); + auto tauq_ = reinterpret_cast(tauq); + auto taup_ = reinterpret_cast(taup); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, d_, e_, tauq_, taup_); @@ 
-1185,10 +1185,10 @@ inline sycl::event gebrd(const char *func_name, Func func, sycl::queue &queue, s } #define GEBRD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - sycl::event gebrd(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE_A *a, \ - std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tauq, TYPE_A *taup, \ - TYPE_A *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event gebrd(sycl::queue& queue, std::int64_t m, std::int64_t n, TYPE_A* a, \ + std::int64_t lda, TYPE_B* d, TYPE_B* e, TYPE_A* tauq, TYPE_A* taup, \ + TYPE_A* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return gebrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, d, e, tauq, taup, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1200,43 +1200,43 @@ GEBRD_LAUNCHER_USM(std::complex, double, rocsolver_zgebrd) #undef GEBRD_LAUNCHER_USM -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const 
std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } -sycl::event gerqf(sycl::queue &queue, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gerqf(sycl::queue& queue, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "gerqf"); } template -inline sycl::event geqrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T *a, std::int64_t lda, T *tau, T *scratchpad, +inline sycl::event geqrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T* a, std::int64_t lda, T* tau, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, tau_); }); @@ -1245,9 +1245,9 @@ 
inline sycl::event geqrf(const char *func_name, Func func, sycl::queue &queue, s } #define GEQRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event geqrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event geqrf(sycl::queue& queue, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return geqrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, tau, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1260,10 +1260,10 @@ GEQRF_LAUNCHER_USM(std::complex, rocsolver_zgeqrf) #undef GEQRF_LAUNCHER_USM template -inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, T *a, std::int64_t lda, std::int64_t *ipiv, T *scratchpad, +inline sycl::event getrf(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, T* a, std::int64_t lda, std::int64_t* ipiv, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, scratchpad_size); @@ -1271,19 +1271,19 @@ inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, s // To get around the limitation. 
// Allocate memory with 32-bit ints then copy over results std::uint64_t ipiv_size = std::min(n, m); - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto devInfo_ = reinterpret_cast(devInfo); - auto ipiv_ = reinterpret_cast(ipiv32); + auto a_ = reinterpret_cast(a); + auto devInfo_ = reinterpret_cast(devInfo); + auto ipiv_ = reinterpret_cast(ipiv32); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, a_, lda, ipiv_, devInfo_); @@ -1291,7 +1291,7 @@ inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, s }); // Copy from 32-bit USM to 64-bit - auto done_casting = queue.submit([&](sycl::handler &cgh) { + auto done_casting = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv[index] = static_cast(ipiv32[index]); @@ -1305,10 +1305,10 @@ inline sycl::event getrf(const char *func_name, Func func, sycl::queue &queue, s } #define GETRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event getrf(sycl::queue &queue, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad, \ + sycl::event getrf(sycl::queue& queue, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, std::int64_t* ipiv, TYPE* scratchpad, \ std::int64_t 
scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return getrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, a, lda, ipiv, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1320,33 +1320,33 @@ GETRF_LAUNCHER_USM(std::complex, rocsolver_zgetrf) #undef GETRF_LAUNCHER_USM -sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri"); } -sycl::event getri(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - std::int64_t *ipiv, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri"); } -sycl::event getri(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "getri"); } -sycl::event getri(sycl::queue &queue, std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getri(sycl::queue& queue, std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* scratchpad, + std::int64_t 
scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "getri"); } template -inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T *a, - std::int64_t lda, std::int64_t *ipiv, T *b, std::int64_t ldb, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event getrs(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, T* a, + std::int64_t lda, std::int64_t* ipiv, T* b, std::int64_t ldb, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, scratchpad_size); @@ -1354,25 +1354,25 @@ inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue, // To get around the limitation. // Create new buffer and convert 64-bit values. std::uint64_t ipiv_size = n; - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); - auto done_casting = queue.submit([&](sycl::handler &cgh) { + auto done_casting = queue.submit([&](sycl::handler& cgh) { cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv32[index] = static_cast(ipiv[index]); }); }); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } cgh.depends_on(done_casting); - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto ipiv_ = reinterpret_cast(ipiv32); - auto b_ = reinterpret_cast(b); + auto a_ = 
reinterpret_cast(a); + auto ipiv_ = reinterpret_cast(ipiv32); + auto b_ = reinterpret_cast(b); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_operation(trans), n, nrhs, a_, lda, ipiv_, b_, ldb); @@ -1387,10 +1387,10 @@ inline sycl::event getrs(const char *func_name, Func func, sycl::queue &queue, } #define GETRS_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event getrs(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, \ - std::int64_t nrhs, TYPE *a, std::int64_t lda, std::int64_t *ipiv, TYPE *b, \ - std::int64_t ldb, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event getrs(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, \ + std::int64_t nrhs, TYPE* a, std::int64_t lda, std::int64_t* ipiv, TYPE* b, \ + std::int64_t ldb, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return getrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, trans, n, nrhs, a, lda, ipiv, \ b, ldb, scratchpad, scratchpad_size, dependencies); \ } @@ -1403,28 +1403,28 @@ GETRS_LAUNCHER_USM(std::complex, rocsolver_zgetrs) #undef GETRS_LAUNCHER_USM template -inline sycl::event gesvd(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event gesvd(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, - std::int64_t n, T_A *a, std::int64_t lda, T_B *s, T_A *u, std::int64_t ldu, - T_A *vt, std::int64_t ldvt, T_A *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, T_A* a, std::int64_t lda, T_B* s, T_A* u, std::int64_t ldu, + T_A* vt, std::int64_t ldvt, T_A* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(m, n, lda, ldu, ldvt, scratchpad_size); - int 
*devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto s_ = reinterpret_cast(s); - auto u_ = reinterpret_cast(u); - auto vt_ = reinterpret_cast(vt); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto s_ = reinterpret_cast(s); + auto u_ = reinterpret_cast(u); + auto vt_ = reinterpret_cast(vt); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_jobsvd(jobu), get_rocsolver_jobsvd(jobvt), m, n, a_, lda, s_, u_, ldu, @@ -1438,11 +1438,11 @@ inline sycl::event gesvd(const char *func_name, Func func, sycl::queue &queue, } #define GESVD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - sycl::event gesvd(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ - std::int64_t m, std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *s, \ - TYPE_A *u, std::int64_t ldu, TYPE_A *vt, std::int64_t ldvt, \ - TYPE_A *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event gesvd(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, \ + std::int64_t m, std::int64_t n, TYPE_A* a, std::int64_t lda, TYPE_B* s, \ + TYPE_A* u, std::int64_t ldu, TYPE_A* vt, std::int64_t ldvt, \ + TYPE_A* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return 
gesvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobu, jobvt, m, n, a, lda, s, \ u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); \ } @@ -1455,25 +1455,25 @@ GESVD_LAUNCHER_USM(std::complex, double, rocsolver_zgesvd) #undef GESVD_LAUNCHER_USM template -inline sycl::event heevd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a, - std::int64_t lda, T_B *&w, T_A *&scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event heevd(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A*& a, + std::int64_t lda, T_B*& w, T_A*& scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto w_ = reinterpret_cast(w); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto w_ = reinterpret_cast(w); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -1486,10 
+1486,10 @@ inline sycl::event heevd(const char *func_name, Func func, sycl::queue &queue, } #define HEEVD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - sycl::event heevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ - std::int64_t n, TYPE_A *a, std::int64_t lda, TYPE_B *w, TYPE_A *scratchpad, \ + sycl::event heevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ + std::int64_t n, TYPE_A* a, std::int64_t lda, TYPE_B* w, TYPE_A* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return heevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1500,27 +1500,27 @@ HEEVD_LAUNCHER_USM(std::complex, double, rocsolver_zheevd) #undef HEEVD_LAUNCHER_USM template -inline sycl::event hegvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A *&a, - std::int64_t lda, T_A *&b, std::int64_t ldb, T_B *&w, T_A *&scratchpad, +inline sycl::event hegvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T_A*& a, + std::int64_t lda, T_A*& b, std::int64_t ldb, T_B*& w, T_A*& scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, 
queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto w_ = reinterpret_cast(w); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto w_ = reinterpret_cast(w); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype), get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, @@ -1533,11 +1533,11 @@ inline sycl::event hegvd(const char *func_name, Func func, sycl::queue &queue, s } #define HEGVD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - sycl::event hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A *a, std::int64_t lda, \ - TYPE_A *b, std::int64_t ldb, TYPE_B *w, TYPE_A *scratchpad, \ + sycl::event hegvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A* a, std::int64_t lda, \ + TYPE_A* b, std::int64_t ldb, TYPE_B* w, TYPE_A* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return hegvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, \ b, ldb, w, scratchpad, scratchpad_size, dependencies); \ } @@ -1548,24 +1548,24 @@ HEGVD_LAUNCHER_USM(std::complex, double, rocsolver_zhegvd) #undef HEGVD_LAUNCHER_USM template -inline sycl::event hetrd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T_A *a, std::int64_t lda, T_B *d, - T_B *e, T_A *tau, T_A *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event 
hetrd(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T_A* a, std::int64_t lda, T_B* d, + T_B* e, T_A* tau, T_A* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType_A = typename RocmEquivalentType::Type; using rocmDataType_B = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto d_ = reinterpret_cast(d); - auto e_ = reinterpret_cast(e); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto d_ = reinterpret_cast(d); + auto e_ = reinterpret_cast(e); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, d_, e_, tau_); @@ -1575,10 +1575,10 @@ inline sycl::event hetrd(const char *func_name, Func func, sycl::queue &queue, } #define HETRD_LAUNCHER_USM(TYPE_A, TYPE_B, ROCSOLVER_ROUTINE) \ - sycl::event hetrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A *a, \ - std::int64_t lda, TYPE_B *d, TYPE_B *e, TYPE_A *tau, TYPE_A *scratchpad, \ + sycl::event hetrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE_A* a, \ + std::int64_t lda, TYPE_B* d, TYPE_B* e, TYPE_A* tau, TYPE_A* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return hetrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1588,36 
+1588,36 @@ HETRD_LAUNCHER_USM(std::complex, double, rocsolver_zhetrd) #undef HETRD_LAUNCHER_USM -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "hetrf"); } -sycl::event hetrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "hetrf"); } template -inline sycl::event orgbr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event orgbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - T *a, std::int64_t lda, T *tau, T *scratchpad, + T* a, std::int64_t lda, T* tau, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, 
queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m, n, k, a_, lda, tau_); @@ -1627,10 +1627,10 @@ inline sycl::event orgbr(const char *func_name, Func func, sycl::queue &queue, } #define ORGBR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event orgbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, \ - std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau, \ - TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event orgbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, \ + std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, TYPE* tau, \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return orgbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1641,21 +1641,21 @@ ORGBR_LAUNCHER_USM(double, rocsolver_dorgbr) #undef ORGBR_LAUNCHER_USM template -inline sycl::event orgqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event orgqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for 
(int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_); }); @@ -1664,9 +1664,9 @@ inline sycl::event orgqr(const char *func_name, Func func, sycl::queue &queue, s } #define ORGQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event orgqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event orgqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return orgqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1677,21 +1677,21 @@ ORGQR_LAUNCHER_USM(double, rocsolver_dorgqr) #undef ORGQR_LAUNCHER_USM template -inline sycl::event orgtr(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event orgtr(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = 
queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, tau_); @@ -1701,9 +1701,9 @@ inline sycl::event orgtr(const char *func_name, Func func, sycl::queue &queue, } #define ORGTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event orgtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event orgtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return orgtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1714,24 +1714,24 @@ ORGTR_LAUNCHER_USM(double, rocsolver_dorgtr) #undef ORGTR_LAUNCHER_USM template -inline sycl::event ormtr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event ormtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a, - std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T* a, + std::int64_t lda, T* tau, T* c, std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const 
std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_fill_mode(uplo), get_rocblas_operation(trans), @@ -1742,11 +1742,11 @@ inline sycl::event ormtr(const char *func_name, Func func, sycl::queue &queue, } #define ORMTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event ormtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event ormtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return ormtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, \ lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -1756,37 +1756,37 @@ ORMTR_LAUNCHER_USM(double, rocsolver_dormtr) #undef ORMTR_LAUNCHER_USM -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, 
oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *c, std::int64_t ldc, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* c, std::int64_t ldc, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "ormrq"); } -sycl::event ormrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *c, std::int64_t ldc, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ormrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* c, std::int64_t ldc, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "ormrq"); } template -inline sycl::event ormqr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event ormqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, - std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, T* c, + std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, ldc, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = 
queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_, @@ -1797,11 +1797,11 @@ inline sycl::event ormqr(const char *func_name, Func func, sycl::queue &queue, } #define ORMQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event ormqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \ - TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event ormqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, \ + TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return ormqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, \ tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -1812,22 +1812,22 @@ ORMQR_LAUNCHER_USM(double, rocsolver_dormqr) #undef ORMQR_LAUNCHER_USM template -inline sycl::event potrf(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event potrf(const char* 
func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto devInfo_ = reinterpret_cast(devInfo); + auto a_ = reinterpret_cast(a); + auto devInfo_ = reinterpret_cast(devInfo); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, devInfo_); @@ -1839,9 +1839,9 @@ inline sycl::event potrf(const char *func_name, Func func, sycl::queue &queue, } #define POTRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event potrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event potrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1854,23 +1854,23 @@ POTRF_LAUNCHER_USM(std::complex, rocsolver_zpotrf) #undef POTRF_LAUNCHER_USM template -inline sycl::event potri(const char *func_name, Func func, sycl::queue &queue, - 
oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event potri(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto scratch_ = reinterpret_cast(scratchpad); - auto devInfo_ = reinterpret_cast(devInfo); + auto a_ = reinterpret_cast(a); + auto scratch_ = reinterpret_cast(scratchpad); + auto devInfo_ = reinterpret_cast(devInfo); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, devInfo_); @@ -1882,9 +1882,9 @@ inline sycl::event potri(const char *func_name, Func func, sycl::queue &queue, } #define POTRI_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event potri(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event potri(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potri(#ROCSOLVER_ROUTINE, 
ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, scratchpad, \ scratchpad_size, dependencies); \ } @@ -1897,22 +1897,22 @@ POTRI_LAUNCHER_USM(std::complex, rocsolver_zpotri) #undef POTRI_LAUNCHER_USM template -inline sycl::event potrs(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T *a, - std::int64_t lda, T *b, std::int64_t ldb, T *scratchpad, +inline sycl::event potrs(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, T* a, + std::int64_t lda, T* b, std::int64_t ldb, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, nrhs, lda, ldb, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, nrhs, a_, lda, b_, ldb); @@ -1922,10 +1922,10 @@ inline sycl::event potrs(const char *func_name, Func func, sycl::queue &queue, } #define POTRS_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event potrs(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, \ - std::int64_t nrhs, TYPE *a, std::int64_t lda, TYPE *b, std::int64_t ldb, \ - TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event potrs(sycl::queue& queue, oneapi::mkl::uplo uplo, 
std::int64_t n, \ + std::int64_t nrhs, TYPE* a, std::int64_t lda, TYPE* b, std::int64_t ldb, \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return potrs(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, nrhs, a, lda, b, ldb, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1938,24 +1938,24 @@ POTRS_LAUNCHER_USM(std::complex, rocsolver_zpotrs) #undef POTRS_LAUNCHER_USM template -inline sycl::event syevd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a, - std::int64_t lda, T *w, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event syevd(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T* a, + std::int64_t lda, T* w, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto w_ = reinterpret_cast(w); - auto scratch_ = reinterpret_cast(scratchpad); - auto devInfo_ = reinterpret_cast(devInfo); + auto a_ = reinterpret_cast(a); + auto w_ = reinterpret_cast(w); + auto scratch_ = reinterpret_cast(scratchpad); + auto devInfo_ = reinterpret_cast(devInfo); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, 
get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, lda, w_, scratch_, @@ -1968,10 +1968,10 @@ inline sycl::event syevd(const char *func_name, Func func, sycl::queue &queue, } #define SYEVD_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event syevd(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ - std::int64_t n, TYPE *a, std::int64_t lda, TYPE *w, TYPE *scratchpad, \ + sycl::event syevd(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, \ + std::int64_t n, TYPE* a, std::int64_t lda, TYPE* w, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return syevd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, jobz, uplo, n, a, lda, w, \ scratchpad, scratchpad_size, dependencies); \ } @@ -1982,26 +1982,26 @@ SYEVD_LAUNCHER_USM(double, rocsolver_dsyevd) #undef SYEVD_LAUNCHER_USM template -inline sycl::event sygvd(const char *func_name, Func func, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T *a, - std::int64_t lda, T *b, std::int64_t ldb, T *w, T *scratchpad, +inline sycl::event sygvd(const char* func_name, Func func, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, T* a, + std::int64_t lda, T* b, std::int64_t ldb, T* w, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, ldb, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); - auto done = queue.submit([&](sycl::handler &cgh) { + int* devInfo = (int*)malloc_device(sizeof(int), queue); + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, 
[=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto b_ = reinterpret_cast(b); - auto w_ = reinterpret_cast(w); - auto devInfo_ = reinterpret_cast(devInfo); - auto scratch_ = reinterpret_cast(scratchpad); + auto a_ = reinterpret_cast(a); + auto b_ = reinterpret_cast(b); + auto w_ = reinterpret_cast(w); + auto devInfo_ = reinterpret_cast(devInfo); + auto scratch_ = reinterpret_cast(scratchpad); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocsolver_itype(itype), get_rocsolver_job(jobz), get_rocblas_fill_mode(uplo), n, a_, @@ -2014,10 +2014,10 @@ inline sycl::event sygvd(const char *func_name, Func func, sycl::queue &queue, s } #define SYGVD_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, \ - oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, std::int64_t lda, TYPE *b, \ - std::int64_t ldb, TYPE *w, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event sygvd(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, \ + oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, std::int64_t lda, TYPE* b, \ + std::int64_t ldb, TYPE* w, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return sygvd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, itype, jobz, uplo, n, a, lda, \ b, ldb, w, scratchpad, scratchpad_size, dependencies); \ } @@ -2028,23 +2028,23 @@ SYGVD_LAUNCHER_USM(double, rocsolver_dsygvd) #undef SYGVD_LAUNCHER_USM template -inline sycl::event sytrd(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *d, T *e, - T *tau, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event sytrd(const char* func_name, Func 
func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, T* d, T* e, + T* tau, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto d_ = reinterpret_cast(d); - auto e_ = reinterpret_cast(e); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto d_ = reinterpret_cast(d); + auto e_ = reinterpret_cast(e); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, d_, e_, tau_); @@ -2054,10 +2054,10 @@ inline sycl::event sytrd(const char *func_name, Func func, sycl::queue &queue, } #define SYTRD_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event sytrd(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *d, TYPE *e, TYPE *tau, TYPE *scratchpad, \ + sycl::event sytrd(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* d, TYPE* e, TYPE* tau, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return sytrd(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, d, e, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2068,30 +2068,30 @@ SYTRD_LAUNCHER_USM(double, rocsolver_dsytrd) #undef SYTRD_LAUNCHER_USM template -inline sycl::event sytrf(const char 
*func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, - std::int64_t *ipiv, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event sytrf(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, + std::int64_t* ipiv, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - int *devInfo = (int *)malloc_device(sizeof(int), queue); + int* devInfo = (int*)malloc_device(sizeof(int), queue); // rocsolver legacy api does not accept 64-bit ints. // To get around the limitation. // Allocate memory with 32-bit ints then copy over results std::uint64_t ipiv_size = n; - int *ipiv32 = (int *)malloc_device(sizeof(int) * ipiv_size, queue); + int* ipiv32 = (int*)malloc_device(sizeof(int) * ipiv_size, queue); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto ipiv_ = reinterpret_cast(ipiv32); - auto devInfo_ = reinterpret_cast(devInfo); + auto a_ = reinterpret_cast(a); + auto ipiv_ = reinterpret_cast(ipiv32); + auto devInfo_ = reinterpret_cast(devInfo); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, ipiv_, devInfo_); @@ -2099,7 +2099,7 @@ inline sycl::event sytrf(const char *func_name, Func func, sycl::queue &queue, }); // Copy from 32-bit USM to 64-bit - auto done_casting = queue.submit([&](sycl::handler &cgh) { + auto 
done_casting = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(done); cgh.parallel_for(sycl::range<1>{ ipiv_size }, [=](sycl::id<1> index) { ipiv[index] = static_cast(ipiv32[index]); @@ -2113,10 +2113,10 @@ inline sycl::event sytrf(const char *func_name, Func func, sycl::queue &queue, } #define SYTRF_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event sytrf(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, std::int64_t *ipiv, TYPE *scratchpad, \ + sycl::event sytrf(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, std::int64_t* ipiv, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return sytrf(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, ipiv, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2128,50 +2128,50 @@ SYTRF_LAUNCHER_USM(std::complex, rocsolver_zsytrf) #undef SYTRF_LAUNCHER_USM -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, double *b, std::int64_t ldb, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event 
trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, double* b, std::int64_t ldb, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } -sycl::event trtrs(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, +sycl::event trtrs(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "trtrs"); } template -inline sycl::event ungbr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event ungbr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, std::int64_t n, std::int64_t k, - T *a, std::int64_t lda, T *tau, T *scratchpad, + T* a, std::int64_t lda, T* tau, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + 
const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_generate(vec), m, n, k, a_, lda, tau_); @@ -2181,10 +2181,10 @@ inline sycl::event ungbr(const char *func_name, Func func, sycl::queue &queue, } #define UNGBR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event ungbr(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, \ - std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, TYPE *tau, \ - TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event ungbr(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, \ + std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, TYPE* tau, \ + TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return ungbr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, vec, m, n, k, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2195,21 +2195,21 @@ UNGBR_LAUNCHER_USM(std::complex, rocsolver_zungbr) #undef UNGBR_LAUNCHER_USM template -inline sycl::event ungqr(const char *func_name, Func func, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, - T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event 
ungqr(const char* func_name, Func func, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, k, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, m, n, k, a_, lda, tau_); }); @@ -2218,9 +2218,9 @@ inline sycl::event ungqr(const char *func_name, Func func, sycl::queue &queue, s } #define UNGQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event ungqr(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event ungqr(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return ungqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, m, n, k, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2231,21 +2231,21 @@ UNGQR_LAUNCHER_USM(std::complex, rocsolver_zungqr) #undef UNGQR_LAUNCHER_USM template -inline sycl::event ungtr(const char *func_name, Func func, sycl::queue &queue, - oneapi::mkl::uplo uplo, std::int64_t n, T *a, std::int64_t lda, T *tau, - T 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +inline sycl::event ungtr(const char* func_name, Func func, sycl::queue& queue, + oneapi::mkl::uplo uplo, std::int64_t n, T* a, std::int64_t lda, T* tau, + T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_fill_mode(uplo), n, a_, lda, tau_); @@ -2255,9 +2255,9 @@ inline sycl::event ungtr(const char *func_name, Func func, sycl::queue &queue, } #define UNGTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event ungtr(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *scratchpad, std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + sycl::event ungtr(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* scratchpad, std::int64_t scratchpad_size, \ + const std::vector& dependencies) { \ return ungtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, uplo, n, a, lda, tau, \ scratchpad, scratchpad_size, dependencies); \ } @@ -2267,39 +2267,39 @@ UNGTR_LAUNCHER_USM(std::complex, rocsolver_zungtr) #undef UNGTR_LAUNCHER_USM -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, 
oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "unmrq"); } -sycl::event unmrq(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event unmrq(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { throw unimplemented("lapack", "unmrq"); } template -inline sycl::event unmqr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event unmqr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::int64_t k, T *a, std::int64_t lda, T *tau, T *c, - std::int64_t ldc, T *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, std::int64_t k, T* a, std::int64_t lda, T* tau, T* c, + std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(n, 
lda, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_operation(trans), m, n, k, a_, lda, tau_, c_, @@ -2310,11 +2310,11 @@ inline sycl::event unmqr(const char *func_name, Func func, sycl::queue &queue, } #define UNMQR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event unmqr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ - std::int64_t m, std::int64_t n, std::int64_t k, TYPE *a, std::int64_t lda, \ - TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event unmqr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, \ + std::int64_t m, std::int64_t n, std::int64_t k, TYPE* a, std::int64_t lda, \ + TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const std::vector &dependencies) { \ + const std::vector& dependencies) { \ return unmqr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, trans, m, n, k, a, lda, \ tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -2325,24 +2325,24 @@ UNMQR_LAUNCHER_USM(std::complex, rocsolver_zunmqr) #undef UNMQR_LAUNCHER_USM template -inline sycl::event unmtr(const char *func_name, Func func, sycl::queue &queue, +inline sycl::event unmtr(const char* func_name, Func func, sycl::queue& queue, oneapi::mkl::side 
side, oneapi::mkl::uplo uplo, - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T *a, - std::int64_t lda, T *tau, T *c, std::int64_t ldc, T *scratchpad, + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, T* a, + std::int64_t lda, T* tau, T* c, std::int64_t ldc, T* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { using rocmDataType = typename RocmEquivalentType::Type; overflow_check(m, n, lda, ldc, scratchpad_size); - auto done = queue.submit([&](sycl::handler &cgh) { + auto done = queue.submit([&](sycl::handler& cgh) { int64_t num_events = dependencies.size(); for (int64_t i = 0; i < num_events; i++) { cgh.depends_on(dependencies[i]); } - onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler &sc) { + onemkl_rocsolver_host_task(cgh, queue, [=](RocsolverScopedContextHandler& sc) { auto handle = sc.get_handle(queue); - auto a_ = reinterpret_cast(a); - auto tau_ = reinterpret_cast(tau); - auto c_ = reinterpret_cast(c); + auto a_ = reinterpret_cast(a); + auto tau_ = reinterpret_cast(tau); + auto c_ = reinterpret_cast(c); rocblas_status err; ROCSOLVER_ERROR_FUNC_T_SYNC(func_name, func, err, handle, get_rocblas_side_mode(side), get_rocblas_fill_mode(uplo), get_rocblas_operation(trans), @@ -2353,11 +2353,11 @@ inline sycl::event unmtr(const char *func_name, Func func, sycl::queue &queue, } #define UNMTR_LAUNCHER_USM(TYPE, ROCSOLVER_ROUTINE) \ - sycl::event unmtr(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ - oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE *a, \ - std::int64_t lda, TYPE *tau, TYPE *c, std::int64_t ldc, TYPE *scratchpad, \ + sycl::event unmtr(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, \ + oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, TYPE* a, \ + std::int64_t lda, TYPE* tau, TYPE* c, std::int64_t ldc, TYPE* scratchpad, \ std::int64_t scratchpad_size, \ - const 
std::vector &dependencies) { \ + const std::vector& dependencies) { \ return unmtr(#ROCSOLVER_ROUTINE, ROCSOLVER_ROUTINE, queue, side, uplo, trans, m, n, a, \ lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); \ } @@ -2384,22 +2384,22 @@ GEBRD_LAUNCHER_SCRATCH(std::complex) #undef GEBRD_LAUNCHER_SCRATCH template <> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "gerqf_scratchpad_size"); } template <> -std::int64_t gerqf_scratchpad_size(sycl::queue &queue, std::int64_t m, std::int64_t n, +std::int64_t gerqf_scratchpad_size(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "gerqf_scratchpad_size"); } template <> -std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gerqf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "gerqf_scratchpad_size"); } template <> -std::int64_t gerqf_scratchpad_size>(sycl::queue &queue, std::int64_t m, +std::int64_t gerqf_scratchpad_size>(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "gerqf_scratchpad_size"); } @@ -2448,20 +2448,20 @@ GETRF_LAUNCHER_SCRATCH(std::complex) #undef GETRF_LAUNCHER_SCRATCH template <> -std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda) { +std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "getri_scratchpad_size"); } template <> -std::int64_t getri_scratchpad_size(sycl::queue &queue, std::int64_t n, std::int64_t lda) { +std::int64_t getri_scratchpad_size(sycl::queue& queue, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "getri_scratchpad_size"); } template <> -std::int64_t 
getri_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "getri_scratchpad_size"); } template <> -std::int64_t getri_scratchpad_size>(sycl::queue &queue, std::int64_t n, +std::int64_t getri_scratchpad_size>(sycl::queue& queue, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "getri_scratchpad_size"); } @@ -2520,12 +2520,12 @@ HETRD_LAUNCHER_SCRATCH(std::complex) #undef HETRD_LAUNCHER_SCRATCH template <> -std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "hetrf_scratchpad_size"); } template <> -std::int64_t hetrf_scratchpad_size>(sycl::queue &queue, oneapi::mkl::uplo uplo, +std::int64_t hetrf_scratchpad_size>(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { throw unimplemented("lapack", "hetrf_scratchpad_size"); } @@ -2568,14 +2568,14 @@ ORGQR_LAUNCHER_SCRATCH(double) #undef ORGQR_LAUNCHER_SCRATCH template <> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { throw unimplemented("lapack", "ormrq_scratchpad_size"); } template <> -std::int64_t ormrq_scratchpad_size(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t ormrq_scratchpad_size(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2758,7 +2758,7 @@ UNGTR_LAUNCHER_SCRATCH(std::complex) #undef UNGTR_LAUNCHER_SCRATCH template <> -std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t 
unmrq_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2766,7 +2766,7 @@ std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, onea throw unimplemented("lapack", "unmrq_scratchpad_size"); } template <> -std::int64_t unmrq_scratchpad_size>(sycl::queue &queue, oneapi::mkl::side side, +std::int64_t unmrq_scratchpad_size>(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, diff --git a/src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp b/src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp index 480ee9fc3..c3ac21f54 100644 --- a/src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp +++ b/src/lapack/backends/rocsolver/rocsolver_scope_handle.cpp @@ -46,7 +46,7 @@ thread_local rocsolver_handle RocsolverScopedContextHandler::handle_ #endif RocsolverScopedContextHandler::RocsolverScopedContextHandler(sycl::queue queue, - sycl::interop_handle &ih) + sycl::interop_handle& ih) : ih(ih), needToRecover_(false) { placedContext_ = new sycl::context(queue.get_context()); @@ -76,8 +76,8 @@ RocsolverScopedContextHandler::~RocsolverScopedContextHandler() noexcept(false) delete placedContext_; } -void ContextCallback(void *userData) { - auto *ptr = static_cast *>(userData); +void ContextCallback(void* userData) { + auto* ptr = static_cast*>(userData); if (!ptr) { return; } @@ -95,7 +95,7 @@ void ContextCallback(void *userData) { } } -rocblas_handle RocsolverScopedContextHandler::get_handle(const sycl::queue &queue) { +rocblas_handle RocsolverScopedContextHandler::get_handle(const sycl::queue& queue) { auto hipDevice = ih.get_native_device(); hipError_t hipErr; hipCtx_t desired; @@ -142,10 +142,10 @@ rocblas_handle RocsolverScopedContextHandler::get_handle(const sycl::queue &queu return handle; } -hipStream_t RocsolverScopedContextHandler::get_stream(const sycl::queue &queue) 
{ +hipStream_t RocsolverScopedContextHandler::get_stream(const sycl::queue& queue) { return sycl::get_native(queue); } -sycl::context RocsolverScopedContextHandler::get_context(const sycl::queue &queue) { +sycl::context RocsolverScopedContextHandler::get_context(const sycl::queue& queue) { return queue.get_context(); } diff --git a/src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp b/src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp index 1be98a3b9..ed445a1da 100644 --- a/src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp +++ b/src/lapack/backends/rocsolver/rocsolver_scope_handle.hpp @@ -52,23 +52,23 @@ namespace rocsolver { class RocsolverScopedContextHandler { hipCtx_t original_; - sycl::context *placedContext_; + sycl::context* placedContext_; bool needToRecover_; - sycl::interop_handle &ih; + sycl::interop_handle& ih; #ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED static thread_local rocsolver_handle handle_helper; #else static thread_local rocsolver_handle handle_helper; #endif - hipStream_t get_stream(const sycl::queue &queue); - sycl::context get_context(const sycl::queue &queue); + hipStream_t get_stream(const sycl::queue& queue); + sycl::context get_context(const sycl::queue& queue); public: - RocsolverScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih); + RocsolverScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih); ~RocsolverScopedContextHandler() noexcept(false); - rocblas_handle get_handle(const sycl::queue &queue); + rocblas_handle get_handle(const sycl::queue& queue); // This is a work-around function for reinterpret_casting the memory. This // will be fixed when SYCL-2020 has been implemented for Pi backend. 
template diff --git a/src/lapack/backends/rocsolver/rocsolver_task.hpp b/src/lapack/backends/rocsolver/rocsolver_task.hpp index 902b2f080..d9e14f653 100644 --- a/src/lapack/backends/rocsolver/rocsolver_task.hpp +++ b/src/lapack/backends/rocsolver/rocsolver_task.hpp @@ -51,7 +51,7 @@ namespace lapack { namespace rocsolver { template -static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { +static inline void host_task_internal(H& cgh, sycl::queue queue, F f) { cgh.host_task([f, queue](cl::sycl::interop_handle ih) { auto sc = RocsolverScopedContextHandler(queue, ih); f(sc); @@ -59,7 +59,7 @@ static inline void host_task_internal(H &cgh, sycl::queue queue, F f) { } template -static inline void onemkl_rocsolver_host_task(H &cgh, sycl::queue queue, F f) { +static inline void onemkl_rocsolver_host_task(H& cgh, sycl::queue queue, F f) { (void)host_task_internal(cgh, queue, f); } diff --git a/src/lapack/function_table.hpp b/src/lapack/function_table.hpp index e034fe357..dee8b8d8e 100644 --- a/src/lapack/function_table.hpp +++ b/src/lapack/function_table.hpp @@ -32,1808 +32,1808 @@ typedef struct { int version; - void (*cgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*dgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*zgebrd_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - 
sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tauq, - sycl::buffer> &taup, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*sgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*cgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zgerqf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*dgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*zgeqrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*dgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, 
sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*zgetrf_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*dgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*zgetri_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cgetrs_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*dgetrs_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sgetrs_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*zgetrs_sycl)(sycl::queue &queue, 
oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*dgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*cgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zgesvd_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, - std::int64_t m, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cheevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zheevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*chegvd_sycl)(sycl::queue &queue, 
std::int64_t itype, oneapi::mkl::job jobz, + void (*cgebrd_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*dgebrd_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sgebrd_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*zgebrd_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tauq, + sycl::buffer>& taup, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*sgerqf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dgerqf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*cgerqf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zgerqf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cgeqrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t 
scratchpad_size); + void (*dgeqrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sgeqrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*zgeqrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cgetrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*dgetrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sgetrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*zgetrf_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cgetri_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*dgetri_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sgetri_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*zgetri_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& 
ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cgetrs_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*dgetrs_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sgetrs_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*zgetrs_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*dgesvd_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sgesvd_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*cgesvd_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, 
std::int64_t ldvt, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zgesvd_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, + std::int64_t m, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cheevd_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zheevd_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*chegvd_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zhegvd_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, + void (*zhegvd_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*chetrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zhetrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &d, 
sycl::buffer &e, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*chetrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zhetrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*sorgbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dorgbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dorgqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sorgqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sorgtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*dorgtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*sormtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer>& scratchpad, + 
std::int64_t scratchpad_size); + void (*chetrd_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zhetrd_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& d, sycl::buffer& e, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*chetrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zhetrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*sorgbr_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dorgbr_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dorgqr_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sorgqr_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sorgtr_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + 
void (*dorgtr_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*sormtr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*dormtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + void (*dormtr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*sormrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dormrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dormqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*sormqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, 
- std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &c, - std::int64_t ldc, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*spotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dpotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*cpotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zpotrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*spotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dpotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*cpotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zpotri_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*spotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dpotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, 
std::int64_t lda, - sycl::buffer &b, std::int64_t ldb, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*cpotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zpotrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*dsyevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*ssyevd_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dsygvd_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*ssygvd_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &w, sycl::buffer &scratchpad, - std::int64_t scratchpad_size); - void (*dsytrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*ssytrd_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, - 
sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*ssytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*dsytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*csytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zsytrf_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*ctrtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*sormrq_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dormrq_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dormqr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t 
scratchpad_size); + void (*sormqr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& c, + std::int64_t ldc, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*spotrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dpotrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*cpotrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zpotrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*spotri_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dpotri_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*cpotri_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zpotri_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*spotrs_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void 
(*dpotrs_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& b, std::int64_t ldb, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*cpotrs_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zpotrs_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*dsyevd_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*ssyevd_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dsygvd_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*ssygvd_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& w, sycl::buffer& scratchpad, + std::int64_t scratchpad_size); + void (*dsytrd_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*ssytrd_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, 
std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*ssytrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*dsytrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*csytrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zsytrf_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*ctrtrs_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*dtrtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + void (*dtrtrs_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*strtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + void (*strtrs_sycl)(sycl::queue& queue, 
oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*ztrtrs_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, + void (*ztrtrs_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cungbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zungbr_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cungqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zungqr_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*cungtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - void (*zungtr_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> 
&scratchpad, - std::int64_t scratchpad_size); - void (*cunmrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cungbr_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zungbr_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cungqr_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zungqr_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cungtr_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*zungtr_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + void (*cunmrq_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, 
std::int64_t scratchpad_size); - void (*zunmrq_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + void (*zunmrq_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*cunmqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + void (*cunmqr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zunmqr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, + void (*zunmqr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*cunmtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + void (*cunmtr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, + sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zunmtr_sycl)(sycl::queue &queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, + void (*zunmtr_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, - sycl::buffer> &c, std::int64_t ldc, - sycl::buffer> &scratchpad, - std::int64_t scratchpad_size); - sycl::event (*cgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *d, double *e, double *tauq, - double *taup, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *d, float *e, float *tauq, float *taup, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgebrd_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, + sycl::buffer>& c, std::int64_t ldc, + sycl::buffer>& scratchpad, + std::int64_t scratchpad_size); + sycl::event (*cgebrd_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + 
std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgebrd_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* d, double* e, double* tauq, + double* taup, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sgebrd_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* d, float* e, float* tauq, float* taup, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zgebrd_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sgerqf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dgerqf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*cgerqf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies); - sycl::event (*zgerqf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zgerqf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*cgeqrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dgeqrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *tau, float *scratchpad, + const std::vector& dependencies); + sycl::event (*sgeqrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgeqrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zgeqrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + 
std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, + const std::vector& dependencies); + sycl::event (*cgetrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgetrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, + const std::vector& dependencies); + sycl::event (*sgetrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgetrf_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, double *a, std::int64_t lda, - 
std::int64_t *ipiv, double *scratchpad, + const std::vector& dependencies); + sycl::event (*zgetrf_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cgetri_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgetri_usm_sycl)(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, + std::int64_t* ipiv, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, float *a, std::int64_t lda, - std::int64_t *ipiv, float *scratchpad, + const std::vector& dependencies); + sycl::event (*sgetri_usm_sycl)(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, + std::int64_t* ipiv, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgetri_usm_sycl)(sycl::queue &queue, std::int64_t n, std::complex *a, - std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t *ipiv, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetrs_usm_sycl)(sycl::queue &queue, 
oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t *ipiv, float *b, std::int64_t ldb, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgetrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + const std::vector& dependencies); + sycl::event (*zgetri_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex* a, + std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cgetrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgetrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t* ipiv, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sgetrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t* ipiv, float* b, std::int64_t ldb, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zgetrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, 
std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgesvd_usm_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *s, double *u, - std::int64_t ldu, double *vt, std::int64_t ldvt, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + double* a, std::int64_t lda, double* s, double* u, + std::int64_t ldu, double* vt, std::int64_t ldvt, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sgesvd_usm_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *s, float *u, std::int64_t ldu, - float *vt, std::int64_t ldvt, float *scratchpad, + float* a, std::int64_t lda, float* s, float* u, std::int64_t ldu, + float* vt, std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + const std::vector& dependencies); + sycl::event (*cgesvd_usm_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *s, - std::complex *u, std::int64_t ldu, - std::complex *vt, std::int64_t ldvt, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgesvd_usm_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + std::complex* a, std::int64_t lda, float* s, + std::complex* u, std::int64_t ldu, + std::complex* vt, std::int64_t ldvt, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zgesvd_usm_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, 
std::int64_t n, - std::complex *a, std::int64_t lda, double *s, - std::complex *u, std::int64_t ldu, - std::complex *vt, std::int64_t ldvt, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cheevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, float *w, std::complex *scratchpad, + std::complex* a, std::int64_t lda, double* s, + std::complex* u, std::int64_t ldu, + std::complex* vt, std::int64_t ldvt, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cheevd_usm_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zheevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, double *w, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zheevd_usm_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*chegvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - float *w, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*chegvd_usm_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + float* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event 
(*zhegvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - double *w, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zhegvd_usm_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*chetrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*chetrd_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zhetrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zhetrd_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*chetrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zhetrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sorgbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dorgbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dorgqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sorgqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sorgtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *scratchpad, + const std::vector& dependencies); + sycl::event (*chetrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zhetrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sorgbr_usm_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + 
sycl::event (*dorgbr_usm_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dorgqr_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sorgqr_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sorgtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* tau, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dorgtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dorgtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* tau, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sormtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*sormtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, float *a, std::int64_t lda, - float *tau, float *c, std::int64_t ldc, float *scratchpad, + std::int64_t m, std::int64_t n, float* a, std::int64_t lda, + float* tau, float* c, std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dormtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const 
std::vector& dependencies); + sycl::event (*dormtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, double *a, std::int64_t lda, - double *tau, double *c, std::int64_t ldc, double *scratchpad, + std::int64_t m, std::int64_t n, double* a, std::int64_t lda, + double* tau, double* c, std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sormrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*sormrq_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dormrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*dormrq_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, - double *c, std::int64_t ldc, double *scratchpad, + std::int64_t k, double* a, std::int64_t lda, double* tau, + double* c, std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dormqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*dormqr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, - double *c, std::int64_t ldc, double *scratchpad, + std::int64_t k, double* a, std::int64_t lda, double* tau, + double* c, std::int64_t 
ldc, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sormqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*sormqr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, + std::int64_t k, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*spotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, + const std::vector& dependencies); + sycl::event (*spotrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dpotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dpotrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cpotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zpotrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*spotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *scratchpad, + const 
std::vector& dependencies); + sycl::event (*cpotrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zpotrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*spotri_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dpotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dpotri_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cpotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zpotri_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*spotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *scratchpad, + const std::vector& dependencies); + sycl::event (*cpotri_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event 
(*zpotri_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*spotrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dpotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dpotrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cpotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zpotrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dsyevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *w, double *scratchpad, + const std::vector& dependencies); + sycl::event (*cpotrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& 
dependencies); + sycl::event (*zpotrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dsyevd_usm_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* w, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*ssyevd_usm_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *w, float *scratchpad, + const std::vector& dependencies); + sycl::event (*ssyevd_usm_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* w, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dsygvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *b, std::int64_t ldb, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*ssygvd_usm_sycl)(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dsytrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, double *d, double *e, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*ssytrd_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, float *d, float *e, float *tau, - float 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*ssytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dsytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*csytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zsytrf_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*ctrtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + const std::vector& dependencies); + sycl::event (*dsygvd_usm_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* b, std::int64_t ldb, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*ssygvd_usm_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dsytrd_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, double* d, double* e, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event 
(*ssytrd_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, float* d, float* e, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*ssytrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t* ipiv, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dsytrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t* ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*csytrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zsytrf_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*ctrtrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dtrtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dtrtrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, - double 
*b, std::int64_t ldb, double *scratchpad, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, + double* b, std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*strtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + const std::vector& dependencies); + sycl::event (*strtrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - float *b, std::int64_t ldb, float *scratchpad, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*ztrtrs_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + const std::vector& dependencies); + sycl::event (*ztrtrs_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cungbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zungbr_usm_sycl)(sycl::queue &queue, oneapi::mkl::generate vec, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cungqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + std::int64_t n, std::int64_t nrhs, 
std::complex* a, + std::int64_t lda, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cungbr_usm_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zungbr_usm_sycl)(sycl::queue& queue, oneapi::mkl::generate vec, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cungqr_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zungqr_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zungqr_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cungtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*cungtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zungtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - 
std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zungtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cunmrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*cunmrq_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zunmrq_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*zunmrq_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cunmqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*cunmqr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + 
std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zunmqr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*zunmqr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, - std::int64_t ldc, std::complex *scratchpad, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, + std::int64_t ldc, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cunmtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + const std::vector& dependencies); + sycl::event (*cunmtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zunmtr_usm_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*zunmtr_usm_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, - std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - void (*sgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - 
sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, + std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + void (*sgeqrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*dgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, + void (*dgeqrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*cgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + void (*cgeqrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zgeqrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + void (*zgeqrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, 
std::int64_t scratchpad_size); - void (*sgetri_batch_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &a, + void (*sgetri_batch_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*dgetri_batch_sycl)(sycl::queue &queue, std::int64_t n, sycl::buffer &a, + void (*dgetri_batch_sycl)(sycl::queue& queue, std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*cgetri_batch_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + void (*cgetri_batch_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zgetri_batch_sycl)(sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + void (*zgetri_batch_sycl)(sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*sgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, 
sycl::buffer &b, std::int64_t ldb, + void (*sgetrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*dgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer &b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*dgetrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*cgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*cgetrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zgetrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, + void (*zgetrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, + std::int64_t 
nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*sgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + void (*sgetrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*dgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, + void (*dgetrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*cgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, + void (*cgetrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zgetrf_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - std::int64_t 
stride_a, sycl::buffer &ipiv, + void (*zgetrf_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*sorgqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, + void (*sorgqr_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*dorgqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer &scratchpad, + void (*dorgqr_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*spotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, + void (*spotrf_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*dpotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - 
std::int64_t batch_size, sycl::buffer &scratchpad, + void (*dpotrf_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size); - void (*cpotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, + void (*cpotrf_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zpotrf_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, + void (*zpotrf_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*spotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + void (*spotrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*dpotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*dpotrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t 
stride_a, sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size); - void (*cpotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, + sycl::buffer& scratchpad, std::int64_t scratchpad_size); + void (*cpotrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zpotrs_batch_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, + void (*zpotrs_batch_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &b, std::int64_t ldb, + sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*cungqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + void (*cungqr_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, + sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - void (*zungqr_batch_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, + void (*zungqr_batch_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, 
+ sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size); - sycl::event (*sgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + sycl::event (*sgeqrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + float* a, std::int64_t lda, std::int64_t stride_a, + float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - double *tau, std::int64_t stride_tau, - std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dgeqrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + double* a, std::int64_t lda, std::int64_t stride_a, + double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + const std::vector& dependencies); + sycl::event (*cgeqrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgeqrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - 
std::int64_t stride_a, std::complex *tau, + const std::vector& dependencies); + sycl::event (*zgeqrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, float *scratchpad, + const std::vector& dependencies); + sycl::event (*sgetrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dgetrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + const std::vector& dependencies); + sycl::event (*cgetrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, 
std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgetrf_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + const std::vector& dependencies); + sycl::event (*zgetrf_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n, float *a, + const std::vector& dependencies); + sycl::event (*sgetri_batch_usm_sycl)(sycl::queue& queue, std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, float *scratchpad, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n, double *a, + const std::vector& dependencies); + sycl::event (*dgetri_batch_usm_sycl)(sycl::queue& queue, std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, double *scratchpad, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n, std::complex *a, + const std::vector& dependencies); + sycl::event (*cgetri_batch_usm_sycl)(sycl::queue& queue, std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, 
std::int64_t stride_ipiv, - std::int64_t batch_size, std::complex *scratchpad, + std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgetri_batch_usm_sycl)(sycl::queue &queue, std::int64_t n, - std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, + const std::vector& dependencies); + sycl::event (*zgetri_batch_usm_sycl)(sycl::queue& queue, std::int64_t n, + std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, float *a, + const std::vector& dependencies); + sycl::event (*sgetrs_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, float *b, + std::int64_t* ipiv, std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, float *scratchpad, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, double *a, + const std::vector& dependencies); + sycl::event (*dgetrs_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, double *b, + std::int64_t* ipiv, std::int64_t stride_ipiv, double* b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, double *scratchpad, + 
std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*cgetrs_batch_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); sycl::event (*zgetrs_batch_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::complex *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sorgqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event 
(*sorgqr_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dorgqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, + const std::vector& dependencies); + sycl::event (*dorgqr_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*spotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*spotrf_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dpotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dpotrf_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cpotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, 
std::int64_t lda, + const std::vector& dependencies); + sycl::event (*cpotrf_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zpotrf_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, + const std::vector& dependencies); + sycl::event (*zpotrf_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*spotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, + const std::vector& dependencies); + sycl::event (*spotrs_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dpotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dpotrs_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - double *scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies); - sycl::event (*cpotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cpotrs_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zpotrs_batch_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, std::complex *a, + const std::vector& dependencies); + sycl::event (*zpotrs_batch_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, std::complex* a, std::int64_t lda, std::int64_t stride_a, - std::complex *b, std::int64_t ldb, + std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cungqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + const std::vector& dependencies); + sycl::event (*cungqr_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zungqr_batch_usm_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, 
std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, + const std::vector& dependencies); + sycl::event (*zungqr_batch_usm_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, std::int64_t batch_size, - std::complex *scratchpad, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*sgeqrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgeqrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cgeqrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* 
lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgeqrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zgeqrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*sgetrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + float** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgetrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + double** a, 
std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cgetrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgetrf_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zgetrf_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*sgetri_group_usm_sycl)(sycl::queue& queue, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, + 
std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dgetri_group_usm_sycl)(sycl::queue& queue, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cgetri_group_usm_sycl)(sycl::queue& queue, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zgetri_group_usm_sycl)(sycl::queue &queue, std::int64_t *n, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zgetri_group_usm_sycl)(sycl::queue& queue, std::int64_t* n, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sgetrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, float **a, - std::int64_t *lda, std::int64_t **ipiv, float **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, + const std::vector& dependencies); + sycl::event (*sgetrs_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, float** a, + std::int64_t* lda, std::int64_t** ipiv, float** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dgetrs_group_usm_sycl)(sycl::queue &queue, 
oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, double **a, - std::int64_t *lda, std::int64_t **ipiv, double **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + const std::vector& dependencies); + sycl::event (*dgetrs_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, double** a, + std::int64_t* lda, std::int64_t** ipiv, double** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cgetrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::transpose *trans, - std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::complex **b, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*cgetrs_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::transpose* trans, + std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::complex** b, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); + const std::vector& dependencies); sycl::event (*zgetrs_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, std::complex **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*sorgqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, 
std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dorgqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, double **a, std::int64_t *lda, - double **tau, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, std::complex** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*sorgqr_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dorgqr_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, double** a, std::int64_t* lda, + double** tau, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*spotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, float **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dpotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, double **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*cpotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, std::complex 
*scratchpad, + const std::vector& dependencies); + sycl::event (*spotrf_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, float** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dpotrf_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, double** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*cpotrf_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zpotrf_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::complex **a, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zpotrf_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::complex** a, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*spotrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, float **a, - std::int64_t *lda, float **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*dpotrs_group_usm_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, double **a, - std::int64_t *lda, double **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t 
*group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); + const std::vector& dependencies); + sycl::event (*spotrs_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, float** a, + std::int64_t* lda, float** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); + sycl::event (*dpotrs_group_usm_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, double** a, + std::int64_t* lda, double** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies); sycl::event (*cpotrs_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies); + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies); sycl::event (*zpotrs_group_usm_sycl)( - sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies); - sycl::event (*cungqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, 
std::int64_t *group_sizes, - std::complex *scratchpad, + sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies); + sycl::event (*cungqr_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); - sycl::event (*zungqr_group_usm_sycl)(sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, + const std::vector& dependencies); + sycl::event (*zungqr_group_usm_sycl)(sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::complex** a, + std::int64_t* lda, std::complex** tau, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies); + const std::vector& dependencies); - std::int64_t (*sgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*sgebrd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*dgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*dgebrd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*cgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*cgebrd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t 
(*zgebrd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*zgebrd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*sgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*sgerqf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*dgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*dgerqf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*cgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*cgerqf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*zgerqf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*zgerqf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*sgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*sgeqrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*dgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*dgeqrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*cgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*cgeqrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*zgeqrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*zgeqrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*sgesvd_scratchpad_size_sycl)(sycl::queue &queue, 
oneapi::mkl::jobsvd jobu, + std::int64_t (*sgesvd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); - std::int64_t (*dgesvd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + std::int64_t (*dgesvd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); - std::int64_t (*cgesvd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + std::int64_t (*cgesvd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); - std::int64_t (*zgesvd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::jobsvd jobu, + std::int64_t (*zgesvd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt); - std::int64_t (*sgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*sgetrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*dgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*dgetrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*cgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*cgetrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - std::int64_t (*zgetrf_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*zgetrf_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda); - 
std::int64_t (*sgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*sgetri_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda); - std::int64_t (*dgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*dgetri_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda); - std::int64_t (*cgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*cgetri_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda); - std::int64_t (*zgetri_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*zgetri_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda); - std::int64_t (*sgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t (*sgetrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*dgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t (*dgetrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*cgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t (*cgetrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*zgetrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::transpose trans, + std::int64_t (*zgetrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*cheevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, + std::int64_t (*cheevd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, 
std::int64_t n, std::int64_t lda); - std::int64_t (*zheevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, + std::int64_t (*zheevd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*chegvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype, + std::int64_t (*chegvd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); - std::int64_t (*zhegvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype, + std::int64_t (*zhegvd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); - std::int64_t (*chetrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*chetrd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*zhetrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zhetrd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*chetrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*chetrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*zhetrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zhetrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*sorgbr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::generate vect, + std::int64_t (*sorgbr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*dorgbr_scratchpad_size_sycl)(sycl::queue &queue, 
oneapi::mkl::generate vect, + std::int64_t (*dorgbr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*sorgtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*sorgtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*dorgtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dorgtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*sorgqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*sorgqr_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*dorgqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*dorgqr_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*sormrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*sormrq_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t (*dormrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*dormrq_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t (*sormqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*sormqr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t 
(*dormqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*dormqr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t (*sormtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*sormtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); - std::int64_t (*dormtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*dormtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); - std::int64_t (*spotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*spotrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*dpotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dpotrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*cpotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*cpotrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*zpotrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zpotrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*spotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*spotrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - 
std::int64_t (*dpotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dpotrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*cpotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*cpotrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*zpotrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zpotrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*spotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*spotri_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*dpotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dpotri_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*cpotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*cpotri_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*zpotri_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zpotri_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*ssytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*ssytrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*dsytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dsytrf_scratchpad_size_sycl)(sycl::queue& queue, 
oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*csytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*csytrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*zsytrf_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zsytrf_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*ssyevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, + std::int64_t (*ssyevd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*dsyevd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::job jobz, + std::int64_t (*dsyevd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*ssygvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype, + std::int64_t (*ssygvd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); - std::int64_t (*dsygvd_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t itype, + std::int64_t (*dsygvd_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb); - std::int64_t (*ssytrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*ssytrd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*dsytrd_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dsytrd_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*strtrs_scratchpad_size_sycl)(sycl::queue 
&queue, oneapi::mkl::uplo uplo, + std::int64_t (*strtrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*dtrtrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dtrtrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*ctrtrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*ctrtrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*ztrtrs_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*ztrtrs_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb); - std::int64_t (*cungbr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::generate vect, + std::int64_t (*cungbr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*zungbr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::generate vect, + std::int64_t (*zungbr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*cungqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, std::int64_t n, + std::int64_t (*cungqr_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*zungqr_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, 
std::int64_t n, + std::int64_t (*zungqr_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda); - std::int64_t (*cungtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*cungtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*zungtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zungtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda); - std::int64_t (*cunmrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*cunmrq_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t (*zunmrq_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*zunmrq_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t (*cunmqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*cunmqr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t (*zunmqr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*zunmqr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc); - std::int64_t (*cunmtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*cunmtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, 
std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); - std::int64_t (*zunmtr_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::side side, + std::int64_t (*zunmtr_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc); - std::int64_t (*sgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*sgetrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); - std::int64_t (*dgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*dgetrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); - std::int64_t (*cgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*cgetrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); - std::int64_t (*zgetrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*zgetrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); - std::int64_t (*sgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*sgetri_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); - std::int64_t (*dgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*dgetri_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, 
std::int64_t stride_ipiv, std::int64_t batch_size); - std::int64_t (*cgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*cgetri_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); - std::int64_t (*zgetri_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t n, + std::int64_t (*zgetri_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size); std::int64_t (*sgetrs_batch_scratchpad_size_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); std::int64_t (*dgetrs_batch_scratchpad_size_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); std::int64_t (*cgetrs_batch_scratchpad_size_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); std::int64_t (*zgetrs_batch_scratchpad_size_sycl)( - sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, 
std::int64_t batch_size); - std::int64_t (*sgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*sgeqrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*dgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*dgeqrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*cgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*cgeqrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*zgeqrf_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*zgeqrf_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*spotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*spotrf_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); - std::int64_t (*dpotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dpotrf_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); - std::int64_t (*cpotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*cpotrf_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); - 
std::int64_t (*zpotrf_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zpotrf_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size); - std::int64_t (*spotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*spotrs_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - std::int64_t (*dpotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*dpotrs_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - std::int64_t (*cpotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*cpotrs_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - std::int64_t (*zpotrs_batch_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo uplo, + std::int64_t (*zpotrs_batch_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size); - std::int64_t (*sorgqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*sorgqr_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*dorgqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t 
m, + std::int64_t (*dorgqr_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*cungqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*cungqr_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*zungqr_batch_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t m, + std::int64_t (*zungqr_batch_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size); - std::int64_t (*sgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + std::int64_t (*sgetrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*dgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*dgetrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*cgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*cgetrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*zgetrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t 
(*zgetrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*sgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*dgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*cgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*zgetri_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); + std::int64_t (*sgetri_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); + std::int64_t (*dgetri_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); + std::int64_t (*cgetri_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); + std::int64_t (*zgetri_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); std::int64_t (*sgetrs_group_scratchpad_size_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); std::int64_t (*dgetrs_group_scratchpad_size_sycl)( - 
sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); std::int64_t (*cgetrs_group_scratchpad_size_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); std::int64_t (*zgetrs_group_scratchpad_size_sycl)( - sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes); - std::int64_t (*sgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes); + std::int64_t (*sgeqrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*dgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*dgeqrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*cgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + std::int64_t* 
group_sizes); + std::int64_t (*cgeqrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*zgeqrf_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*zgeqrf_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*sorgqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*dorgqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*spotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*sorgqr_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); + std::int64_t (*dorgqr_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); + std::int64_t (*spotrf_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*dpotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*dpotrf_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* 
lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*cpotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*cpotrf_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*zpotrf_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *lda, + std::int64_t* group_sizes); + std::int64_t (*zpotrf_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*spotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, + std::int64_t* group_sizes); + std::int64_t (*spotrs_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*dpotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, + std::int64_t* group_sizes); + std::int64_t (*dpotrs_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*cpotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, + std::int64_t* group_sizes); + std::int64_t (*cpotrs_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, + 
std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*zpotrs_group_scratchpad_size_sycl)(sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, - std::int64_t *lda, std::int64_t *ldb, + std::int64_t* group_sizes); + std::int64_t (*zpotrs_group_scratchpad_size_sycl)(sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, + std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*cungqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); - std::int64_t (*zungqr_group_scratchpad_size_sycl)(sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes); + std::int64_t* group_sizes); + std::int64_t (*cungqr_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); + std::int64_t (*zungqr_group_scratchpad_size_sycl)(sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes); } lapack_function_table_t; diff --git a/src/lapack/lapack_loader.cpp b/src/lapack/lapack_loader.cpp index 43fe349d1..81e8ce729 100644 --- a/src/lapack/lapack_loader.cpp +++ b/src/lapack/lapack_loader.cpp @@ -30,2001 +30,2001 @@ namespace detail { static oneapi::mkl::detail::table_initializer function_tables; -void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tauq, - sycl::buffer> &taup, sycl::buffer> &scratchpad, +void gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, 
std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tauq, + sycl::buffer>& taup, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tauq, sycl::buffer &taup, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tauq, sycl::buffer& taup, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tauq, sycl::buffer &taup, sycl::buffer &scratchpad, +void gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tauq, sycl::buffer& taup, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tauq, - sycl::buffer> &taup, sycl::buffer> &scratchpad, +void gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tauq, + sycl::buffer>& taup, sycl::buffer>& 
scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgebrd_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size); } -void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, 
sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgerqf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, 
sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgeqrf_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size); } -void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t 
lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgetrf_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer &scratchpad, +void getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgetri_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size); } 
-void getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, sycl::buffer> &b, - std::int64_t ldb, sycl::buffer> &scratchpad, +void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, sycl::buffer>& b, + std::int64_t ldb, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void 
getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgetrs_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size); } -void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, sycl::buffer &scratchpad, +void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &s, sycl::buffer &u, std::int64_t ldu, - sycl::buffer &vt, std::int64_t ldvt, sycl::buffer &scratchpad, +void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& s, sycl::buffer& u, std::int64_t ldu, + sycl::buffer& vt, 
std::int64_t ldvt, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, +void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, +void gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &s, - sycl::buffer> &u, std::int64_t ldu, - sycl::buffer> &vt, std::int64_t ldvt, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, sycl::buffer& s, + sycl::buffer>& u, std::int64_t ldu, + sycl::buffer>& vt, std::int64_t ldvt, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgesvd_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size); } -void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &w, sycl::buffer> &scratchpad, +void heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, 
std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& w, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cheevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer> &a, - std::int64_t lda, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer>& a, + std::int64_t lda, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zheevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, +void hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].chegvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -void hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, +void hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
function_tables[libkey].zhegvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].chetrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zhetrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void hetrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].chetrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, 
std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void hetrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zhetrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, std::int64_t lda, - sycl::buffer &tau, sycl::buffer &scratchpad, +void orgbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, std::int64_t lda, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sorgbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer &a, - std::int64_t lda, sycl::buffer &tau, sycl::buffer &scratchpad, +void orgbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer& a, + std::int64_t lda, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dorgbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { 
function_tables[libkey].dorgqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sorgqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sorgtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dorgtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, 
sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sormtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dormtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sormrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + 
sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dormrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dormqr_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer &a, std::int64_t lda, sycl::buffer &tau, - sycl::buffer &c, std::int64_t ldc, sycl::buffer &scratchpad, + sycl::buffer& a, std::int64_t lda, sycl::buffer& tau, + sycl::buffer& c, std::int64_t ldc, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sormqr_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].spotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void 
potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dpotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cpotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zpotrf_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &scratchpad, +void potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].spotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, 
std::int64_t lda, sycl::buffer &scratchpad, +void potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dpotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cpotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zpotri_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size); } -void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].spotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo 
uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, sycl::buffer &b, - std::int64_t ldb, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, sycl::buffer& b, + std::int64_t ldb, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dpotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cpotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - std::int64_t nrhs, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + std::int64_t nrhs, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zpotrs_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, +void 
syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dsyevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, std::int64_t lda, - sycl::buffer &w, sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, std::int64_t lda, + sycl::buffer& w, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].ssyevd_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size); } -void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void sygvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dsygvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -void sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer &a, - std::int64_t lda, sycl::buffer &b, std::int64_t ldb, sycl::buffer &w, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void sygvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, + 
oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, sycl::buffer& a, + std::int64_t lda, sycl::buffer& b, std::int64_t ldb, sycl::buffer& w, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].ssygvd_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size); } -void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, - sycl::buffer &e, sycl::buffer &tau, sycl::buffer &scratchpad, +void sytrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, + sycl::buffer& e, sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dsytrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &d, sycl::buffer &e, - sycl::buffer &tau, sycl::buffer &scratchpad, +void sytrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& d, sycl::buffer& e, + sycl::buffer& tau, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].ssytrd_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size); } -void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].ssytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); 
} -void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dsytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, sycl::buffer &ipiv, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, sycl::buffer& ipiv, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].csytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer &ipiv, sycl::buffer> &scratchpad, +void sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer& ipiv, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zsytrf_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size); } -void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, 
std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].ctrtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dtrtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer &a, std::int64_t lda, sycl::buffer &b, std::int64_t ldb, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer& a, std::int64_t lda, sycl::buffer& b, std::int64_t ldb, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].strtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +void trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &b, std::int64_t ldb, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& b, std::int64_t ldb, + 
sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].ztrtrs_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size); } -void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void ungbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cungbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer> &a, - std::int64_t lda, sycl::buffer> &tau, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { +void ungbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, sycl::buffer>& a, + std::int64_t lda, sycl::buffer>& tau, + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zungbr_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cungqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t 
m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zungqr_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size); } -void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cungtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &scratchpad, +void ungtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zungtr_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size); } -void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { 
function_tables[libkey].cunmrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zunmrq_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cunmqr_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zunmqr_sycl(queue, side, trans, m, n, 
k, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cunmtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -void unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +void unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, - sycl::buffer> &tau, sycl::buffer> &c, - std::int64_t ldc, sycl::buffer> &scratchpad, + sycl::buffer>& a, std::int64_t lda, + sycl::buffer>& tau, sycl::buffer>& c, + std::int64_t ldc, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zunmtr_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size); } -sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return 
function_tables[libkey].cgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *d, double *e, double *tauq, double *taup, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double* a, std::int64_t lda, double* d, double* e, double* tauq, double* taup, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *d, float *e, float *tauq, float *taup, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float* a, std::int64_t lda, float* d, float* e, float* tauq, float* taup, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tauq, std::complex *taup, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gebrd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tauq, std::complex* taup, + std::complex* scratchpad, std::int64_t 
scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgebrd_usm_sycl(queue, m, n, a, lda, d, e, tauq, taup, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float* a, std::int64_t lda, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].sgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double* a, std::int64_t lda, double* tau, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event gerqf(oneapi::mkl::device 
libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gerqf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgerqf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double* a, std::int64_t lda, double* tau, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector 
&dependencies) { +sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float* a, std::int64_t lda, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].sgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgeqrf_usm_sycl(queue, m, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - double *a, std::int64_t lda, std::int64_t *ipiv, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + double* a, std::int64_t lda, std::int64_t* ipiv, double* 
scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].sgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgetrf_usm_sycl(queue, m, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } 
-sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *a, - std::int64_t lda, std::int64_t *ipiv, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* a, + std::int64_t lda, std::int64_t* ipiv, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *a, - std::int64_t lda, std::int64_t *ipiv, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* a, + std::int64_t lda, std::int64_t* ipiv, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].sgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getri(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgetri_usm_sycl(queue, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies) { +sycl::event getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t *ipiv, double *b, std::int64_t ldb, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t* ipiv, double* b, std::int64_t ldb, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, std::int64_t *ipiv, - float *b, std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, std::int64_t* ipiv, + float* b, std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event 
getrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t *ipiv, std::complex *b, std::int64_t ldb, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t* ipiv, std::complex* b, std::int64_t ldb, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgetrs_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, double *a, - std::int64_t lda, double *s, double *u, std::int64_t ldu, double *vt, - std::int64_t ldvt, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, double* a, + std::int64_t lda, double* s, double* u, std::int64_t ldu, double* vt, + std::int64_t ldvt, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float *a, - std::int64_t lda, float *s, float *u, std::int64_t ldu, float *vt, - std::int64_t ldvt, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event gesvd(oneapi::mkl::device libkey, 
sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, float* a, + std::int64_t lda, float* s, float* u, std::int64_t ldu, float* vt, + std::int64_t ldvt, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, - oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::complex *a, - std::int64_t lda, float *s, std::complex *u, std::int64_t ldu, - std::complex *vt, std::int64_t ldvt, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, + oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::complex* a, + std::int64_t lda, float* s, std::complex* u, std::int64_t ldu, + std::complex* vt, std::int64_t ldvt, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].cgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::jobsvd jobu, +sycl::event gesvd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, - std::complex *a, std::int64_t lda, double *s, std::complex *u, - std::int64_t ldu, std::complex *vt, std::int64_t ldvt, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* a, std::int64_t lda, double* s, std::complex* u, + std::int64_t ldu, std::complex* vt, std::int64_t ldvt, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& 
dependencies) { return function_tables[libkey].zgesvd_usm_sycl(queue, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt, scratchpad, scratchpad_size, dependencies); } -sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, std::int64_t lda, - float *w, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, std::int64_t lda, + float* w, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cheevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -sycl::event heevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, std::complex *a, std::int64_t lda, - double *w, std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event heevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, std::complex* a, std::int64_t lda, + double* w, std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zheevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, +sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, float *w, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, 
float* w, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].chegvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, +sycl::event hegvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, - std::complex *a, std::int64_t lda, std::complex *b, - std::int64_t ldb, double *w, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* b, + std::int64_t ldb, double* w, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zhegvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, float *d, float *e, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, float* d, float* e, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].chetrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, double *d, double *e, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event hetrd(oneapi::mkl::device libkey, 
sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, double* d, double* e, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zhetrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].chetrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event hetrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zhetrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - float *tau, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue& queue, 
oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + float* tau, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sorgbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - double *tau, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + double* tau, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dorgbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, double *a, std::int64_t lda, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, double* a, std::int64_t lda, double* tau, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dorgqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, float *a, std::int64_t lda, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event orgqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, float* a, 
std::int64_t lda, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].sorgqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *tau, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* tau, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].sorgtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *tau, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event orgtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* tau, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dorgtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, float *tau, float *c, - std::int64_t ldc, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, float* a, std::int64_t lda, float* tau, float* c, + std::int64_t ldc, float* scratchpad, std::int64_t scratchpad_size, + const 
std::vector& dependencies) { return function_tables[libkey].sormtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event ormtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, double *tau, double *c, - std::int64_t ldc, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, double* a, std::int64_t lda, double* tau, double* c, + std::int64_t ldc, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dormtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sormrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event ormrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double 
*scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + double* a, std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dormrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - double *a, std::int64_t lda, double *tau, double *c, std::int64_t ldc, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + double* a, std::int64_t lda, double* tau, double* c, std::int64_t ldc, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dormqr_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event ormqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - float *a, std::int64_t lda, float *tau, float *c, std::int64_t ldc, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + float* a, std::int64_t lda, float* tau, float* c, std::int64_t ldc, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sormqr_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *scratchpad, - 
std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].spotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dpotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cpotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t 
scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zpotrf_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].spotri_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dpotri_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cpotri_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potri(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - 
std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potri(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zpotri_usm_sycl(queue, uplo, n, a, lda, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, float *b, - std::int64_t ldb, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, float* b, + std::int64_t ldb, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].spotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, double *b, - std::int64_t ldb, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, double* b, + std::int64_t ldb, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dpotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, std::complex 
*a, std::int64_t lda, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].cpotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event potrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zpotrs_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event syevd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, double *a, std::int64_t lda, double *w, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, double* a, std::int64_t lda, double* w, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dsyevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -sycl::event syevd(oneapi::mkl::device libkey, sycl::queue 
&queue, oneapi::mkl::job jobz, - oneapi::mkl::uplo uplo, std::int64_t n, float *a, std::int64_t lda, float *w, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event syevd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, + oneapi::mkl::uplo uplo, std::int64_t n, float* a, std::int64_t lda, float* w, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].ssyevd_usm_sycl(queue, jobz, uplo, n, a, lda, w, scratchpad, scratchpad_size, dependencies); } -sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, double *a, - std::int64_t lda, double *b, std::int64_t ldb, double *w, double *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, double* a, + std::int64_t lda, double* b, std::int64_t ldb, double* w, double* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].dsygvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t itype, - oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, float *a, - std::int64_t lda, float *b, std::int64_t ldb, float *w, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event sygvd(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, + oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, float* a, + std::int64_t lda, float* b, std::int64_t ldb, float* w, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return 
function_tables[libkey].ssygvd_usm_sycl(queue, itype, jobz, uplo, n, a, lda, b, ldb, w, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, double *d, double *e, double *tau, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, double* d, double* e, double* tau, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dsytrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, float *d, float *e, float *tau, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sytrd(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, float* d, float* e, float* tau, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].ssytrd_usm_sycl(queue, uplo, n, a, lda, d, e, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, std::int64_t *ipiv, float *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, std::int64_t* ipiv, float* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].ssytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, 
scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, std::int64_t *ipiv, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, std::int64_t* ipiv, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dsytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].csytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, std::int64_t *ipiv, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event sytrf(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t* ipiv, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zsytrf_usm_sycl(queue, uplo, n, a, lda, ipiv, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(oneapi::mkl::device libkey, 
sycl::queue &queue, oneapi::mkl::uplo uplo, +sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].ctrtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, double *a, std::int64_t lda, double *b, std::int64_t ldb, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t nrhs, double* a, std::int64_t lda, double* b, std::int64_t ldb, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dtrtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, float *a, std::int64_t lda, float *b, std::int64_t ldb, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t nrhs, float* a, std::int64_t lda, float* b, std::int64_t ldb, + float* scratchpad, std::int64_t scratchpad_size, + 
const std::vector& dependencies) { return function_tables[libkey].strtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, +sycl::event trtrs(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, - std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::complex *b, std::int64_t ldb, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::complex* b, std::int64_t ldb, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].ztrtrs_usm_sycl(queue, uplo, trans, diag, n, nrhs, a, lda, b, ldb, scratchpad, scratchpad_size, dependencies); } -sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].cungbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungbr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::generate vec, - std::int64_t m, std::int64_t n, std::int64_t k, std::complex *a, - std::int64_t lda, std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungbr(oneapi::mkl::device libkey, 
sycl::queue& queue, oneapi::mkl::generate vec, + std::int64_t m, std::int64_t n, std::int64_t k, std::complex* a, + std::int64_t lda, std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zungbr_usm_sycl(queue, vec, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].cungqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungqr(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zungqr_usm_sycl(queue, m, n, k, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue& 
queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].cungtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { +sycl::event ungtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zungtr_usm_sycl(queue, uplo, n, a, lda, tau, scratchpad, scratchpad_size, dependencies); } -sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].cunmrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event unmrq(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - 
std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zunmrq_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].cunmqr_usm_sycl(queue, side, trans, m, n, k, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event unmqr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, - std::complex *a, std::int64_t lda, std::complex *tau, - std::complex *c, std::int64_t ldc, std::complex *scratchpad, - std::int64_t scratchpad_size, const std::vector &dependencies) { + std::complex* a, std::int64_t lda, std::complex* tau, + std::complex* c, std::int64_t ldc, std::complex* scratchpad, + std::int64_t scratchpad_size, const std::vector& dependencies) { return function_tables[libkey].zunmqr_usm_sycl(queue, side, trans, m, n, k, a, 
lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cunmtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::side side, +sycl::event unmtr(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::complex *tau, std::complex *c, std::int64_t ldc, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::int64_t n, std::complex* a, std::int64_t lda, + std::complex* tau, std::complex* c, std::int64_t ldc, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zunmtr_usm_sycl(queue, side, uplo, trans, m, n, a, lda, tau, c, ldc, scratchpad, scratchpad_size, dependencies); } -void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void 
geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer> &tau, 
std::int64_t stride_tau, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer>& tau, std::int64_t stride_tau, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgeqrf_batch_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - 
std::int64_t batch_size, sycl::buffer> &scratchpad, +void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgetri_batch_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, 
scratchpad_size); } -void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &ipiv, std::int64_t stride_ipiv, - sycl::buffer &b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& ipiv, std::int64_t stride_ipiv, + sycl::buffer& b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, std::int64_t ldb, +void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, std::int64_t 
stride_a, sycl::buffer &ipiv, - std::int64_t stride_ipiv, sycl::buffer> &b, std::int64_t ldb, +void getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer& ipiv, + std::int64_t stride_ipiv, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgetrs_batch_sycl(queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sgetrf_batch_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer &scratchpad, +void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dgetrf_batch_sycl(queue, m, n, a, 
lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cgetrf_batch_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - sycl::buffer> &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, sycl::buffer> &scratchpad, +void getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zgetrf_batch_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size); } -void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t 
batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].sorgqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - sycl::buffer &tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer &scratchpad, std::int64_t scratchpad_size) { +void orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + sycl::buffer& tau, std::int64_t stride_tau, std::int64_t batch_size, + sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dorgqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].spotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer &a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, sycl::buffer &scratchpad, +void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer& a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t 
scratchpad_size) { function_tables[libkey].dpotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, +void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cpotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, sycl::buffer> &a, std::int64_t lda, +void potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, sycl::buffer>& a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zpotrf_batch_sycl(queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].spotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, 
stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer &b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, sycl::buffer &scratchpad, +void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer& b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, sycl::buffer& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].dpotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer> &b, +void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer>& b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cpotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, sycl::buffer> &a, - std::int64_t lda, std::int64_t stride_a, sycl::buffer> &b, +void potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, sycl::buffer>& a, + std::int64_t lda, std::int64_t stride_a, sycl::buffer>& b, 
std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zpotrs_batch_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size); } -void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +void ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].cungqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -void ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, - std::int64_t k, sycl::buffer> &a, std::int64_t lda, - std::int64_t stride_a, sycl::buffer> &tau, +void ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, + std::int64_t k, sycl::buffer>& a, std::int64_t lda, + std::int64_t stride_a, sycl::buffer>& tau, std::int64_t stride_tau, std::int64_t batch_size, - sycl::buffer> &scratchpad, std::int64_t scratchpad_size) { + sycl::buffer>& scratchpad, std::int64_t scratchpad_size) { function_tables[libkey].zungqr_batch_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size); } -sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, std::int64_t stride_a, - float *tau, std::int64_t stride_tau, std::int64_t batch_size, - float *scratchpad, std::int64_t 
scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, + float* tau, std::int64_t stride_tau, std::int64_t batch_size, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sgeqrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, std::int64_t stride_a, - double *tau, std::int64_t stride_tau, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, + double* tau, std::int64_t stride_tau, std::int64_t batch_size, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dgeqrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].cgeqrf_batch_usm_sycl(queue, 
m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].zgeqrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sgetrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector 
&dependencies) { +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dgetrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].cgetrf_batch_usm_sycl(queue, m, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::int64_t *ipiv, std::int64_t stride_ipiv, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::int64_t* ipiv, std::int64_t stride_ipiv, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].zgetrf_batch_usm_sycl(queue, m, n, 
a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, float *scratchpad, +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].sgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, std::int64_t batch_size, double *scratchpad, +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].dgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* a, std::int64_t 
lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::int64_t batch_size, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgetri_batch_usm_sycl(queue, n, a, lda, stride_a, ipiv, stride_ipiv, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, float *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, float* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, float* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return 
function_tables[libkey].sgetrs_batch_usm_sycl( queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, double *a, - std::int64_t lda, std::int64_t stride_a, std::int64_t *ipiv, - std::int64_t stride_ipiv, double *b, std::int64_t ldb, - std::int64_t stride_b, std::int64_t batch_size, double *scratchpad, +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, double* a, + std::int64_t lda, std::int64_t stride_a, std::int64_t* ipiv, + std::int64_t stride_ipiv, double* b, std::int64_t ldb, + std::int64_t stride_b, std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].dgetrs_batch_usm_sycl( queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgetrs_batch_usm_sycl( queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, 
stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, - std::complex *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t *ipiv, std::int64_t stride_ipiv, std::complex *b, + std::complex* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t* ipiv, std::int64_t stride_ipiv, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgetrs_batch_usm_sycl( queue, trans, n, nrhs, a, lda, stride_a, ipiv, stride_ipiv, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, float *a, std::int64_t lda, - std::int64_t stride_a, float *tau, std::int64_t stride_tau, - std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, float* a, std::int64_t lda, + std::int64_t stride_a, float* tau, std::int64_t stride_tau, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sorgqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, double *a, std::int64_t lda, - std::int64_t stride_a, double *tau, std::int64_t 
stride_tau, - std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, double* a, std::int64_t lda, + std::int64_t stride_a, double* tau, std::int64_t stride_tau, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dorgqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, float *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, float* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].spotrf_batch_usm_sycl( queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, double *a, std::int64_t lda, std::int64_t stride_a, - std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, double* a, std::int64_t lda, std::int64_t stride_a, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dpotrf_batch_usm_sycl( queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, 
scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cpotrf_batch_usm_sycl( queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::complex *a, std::int64_t lda, +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::complex* a, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zpotrf_batch_usm_sycl( queue, uplo, n, a, lda, stride_a, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, float *a, std::int64_t lda, - std::int64_t stride_a, float *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, float* a, std::int64_t lda, + std::int64_t stride_a, float* b, std::int64_t ldb, std::int64_t stride_b, + 
std::int64_t batch_size, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].spotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, double *a, std::int64_t lda, - std::int64_t stride_a, double *b, std::int64_t ldb, std::int64_t stride_b, - std::int64_t batch_size, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, double* a, std::int64_t lda, + std::int64_t stride_a, double* b, std::int64_t ldb, std::int64_t stride_b, + std::int64_t batch_size, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dpotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *b, std::int64_t ldb, +sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cpotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, 
scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, - std::int64_t n, std::int64_t nrhs, std::complex *a, - std::int64_t lda, std::int64_t stride_a, std::complex *b, +sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, + std::int64_t n, std::int64_t nrhs, std::complex* a, + std::int64_t lda, std::int64_t stride_a, std::complex* b, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zpotrs_batch_usm_sycl(queue, uplo, n, nrhs, a, lda, stride_a, b, ldb, stride_b, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].cungqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, - std::int64_t n, std::int64_t k, std::complex *a, std::int64_t lda, - std::int64_t stride_a, std::complex *tau, std::int64_t stride_tau, - std::int64_t batch_size, std::complex *scratchpad, +sycl::event 
ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, + std::int64_t n, std::int64_t k, std::complex* a, std::int64_t lda, + std::int64_t stride_a, std::complex* tau, std::int64_t stride_tau, + std::int64_t batch_size, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].zungqr_batch_usm_sycl(queue, m, n, k, a, lda, stride_a, tau, stride_tau, batch_size, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, float *scratchpad, +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].sgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, double **a, std::int64_t *lda, double **tau, - std::int64_t group_count, std::int64_t *group_sizes, double *scratchpad, +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, double** a, std::int64_t* lda, double** tau, + std::int64_t group_count, std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].dgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(oneapi::mkl::device 
libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].cgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event geqrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].zgeqrf_group_usm_sycl(queue, m, n, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, float **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, float *scratchpad, +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, float** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + 
const std::vector& dependencies) { return function_tables[libkey].sgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, double **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, double *scratchpad, +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, double** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].dgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t **ipiv, std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const 
std::vector &dependencies) { +sycl::event getrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t** ipiv, std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgetrf_group_usm_sycl(queue, m, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, float **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, float** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, double **a, - std::int64_t *lda, std::int64_t **ipiv, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, double** a, + std::int64_t* lda, std::int64_t** ipiv, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, 
sycl::queue &queue, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *n, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getri_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* n, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zgetri_group_usm_sycl(queue, n, a, lda, ipiv, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - float **a, std::int64_t *lda, std::int64_t **ipiv, float **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, - float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + float** a, std::int64_t* lda, 
std::int64_t** ipiv, float** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes, + float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].sgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - double **a, std::int64_t *lda, std::int64_t **ipiv, double **b, - std::int64_t *ldb, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + double** a, std::int64_t* lda, std::int64_t** ipiv, double** b, + std::int64_t* ldb, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const 
std::vector& dependencies) { return function_tables[libkey].cgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, std::int64_t *nrhs, - std::complex **a, std::int64_t *lda, std::int64_t **ipiv, - std::complex **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event getrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, std::int64_t* nrhs, + std::complex** a, std::int64_t* lda, std::int64_t** ipiv, + std::complex** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].zgetrs_group_usm_sycl(queue, trans, n, nrhs, a, lda, ipiv, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, float **a, std::int64_t *lda, float **tau, - std::int64_t group_count, std::int64_t *group_sizes, float *scratchpad, +sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, float** a, std::int64_t* lda, float** tau, + std::int64_t group_count, std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].sorgqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, double 
**a, std::int64_t *lda, - double **tau, std::int64_t group_count, std::int64_t *group_sizes, - double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event orgqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, double** a, std::int64_t* lda, + double** tau, std::int64_t group_count, std::int64_t* group_sizes, + double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dorgqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, float **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, float** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].spotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, double **a, std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, double** a, std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return 
function_tables[libkey].dpotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cpotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::complex **a, std::int64_t *lda, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrf_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::complex** a, std::int64_t* lda, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zpotrf_group_usm_sycl(queue, uplo, n, a, lda, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, float **a, std::int64_t *lda, - float **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, float *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { 
+sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, float** a, std::int64_t* lda, + float** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, float* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].spotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, double **a, std::int64_t *lda, - double **b, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes, double *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, double** a, std::int64_t* lda, + double** b, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes, double* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].dpotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* 
scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].cpotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, - std::int64_t *n, std::int64_t *nrhs, std::complex **a, - std::int64_t *lda, std::complex **b, std::int64_t *ldb, - std::int64_t group_count, std::int64_t *group_sizes, - std::complex *scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { +sycl::event potrs_batch(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, + std::int64_t* n, std::int64_t* nrhs, std::complex** a, + std::int64_t* lda, std::complex** b, std::int64_t* ldb, + std::int64_t group_count, std::int64_t* group_sizes, + std::complex* scratchpad, std::int64_t scratchpad_size, + const std::vector& dependencies) { return function_tables[libkey].zpotrs_group_usm_sycl(queue, uplo, n, nrhs, a, lda, b, ldb, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex** a, + std::int64_t* lda, std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].cungqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } -sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue &queue, 
std::int64_t *m, - std::int64_t *n, std::int64_t *k, std::complex **a, - std::int64_t *lda, std::complex **tau, std::int64_t group_count, - std::int64_t *group_sizes, std::complex *scratchpad, +sycl::event ungqr_batch(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* k, std::complex** a, + std::int64_t* lda, std::complex** tau, std::int64_t group_count, + std::int64_t* group_sizes, std::complex* scratchpad, std::int64_t scratchpad_size, - const std::vector &dependencies) { + const std::vector& dependencies) { return function_tables[libkey].zungqr_group_usm_sycl(queue, m, n, k, a, lda, tau, group_count, group_sizes, scratchpad, scratchpad_size, dependencies); } template <> -std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].sgebrd_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t gebrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dgebrd_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t gebrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cgebrd_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t gebrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zgebrd_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t 
gerqf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].sgerqf_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t gerqf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dgerqf_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t gerqf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cgerqf_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t gerqf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zgerqf_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].sgeqrf_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t geqrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dgeqrf_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t geqrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cgeqrf_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t 
geqrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zgeqrf_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) { @@ -2032,7 +2032,7 @@ std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queu ldvt); } template <> -std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldu, std::int64_t ldvt) { @@ -2041,7 +2041,7 @@ std::int64_t gesvd_scratchpad_size(oneapi::mkl::device libkey, sycl::que } template <> std::int64_t gesvd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, @@ -2051,7 +2051,7 @@ std::int64_t gesvd_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t gesvd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n, std::int64_t lda, @@ -2060,64 +2060,64 @@ std::int64_t gesvd_scratchpad_size>(oneapi::mkl::device lib ldvt); } template <> -std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return 
function_tables[libkey].sgetrf_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dgetrf_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t getrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cgetrf_scratchpad_size_sycl(queue, m, n, lda); } template <> std::int64_t getrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zgetrf_scratchpad_size_sycl(queue, m, n, lda); } template <> -std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda) { return function_tables[libkey].sgetri_scratchpad_size_sycl(queue, n, lda); } template <> -std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dgetri_scratchpad_size_sycl(queue, n, lda); } template <> std::int64_t getri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t n, + sycl::queue& queue, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cgetri_scratchpad_size_sycl(queue, n, lda); } template <> std::int64_t getri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t n, + sycl::queue& queue, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zgetri_scratchpad_size_sycl(queue, n, 
lda); } template <> -std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return function_tables[libkey].sgetrs_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb); } template <> -std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return function_tables[libkey].dgetrs_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb); } template <> std::int64_t getrs_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -2125,7 +2125,7 @@ std::int64_t getrs_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t getrs_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -2133,21 +2133,21 @@ std::int64_t getrs_scratchpad_size>(oneapi::mkl::device lib } template <> std::int64_t heevd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::job jobz, + sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cheevd_scratchpad_size_sycl(queue, jobz, uplo, n, lda); } template <> std::int64_t heevd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::job jobz, + sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zheevd_scratchpad_size_sycl(queue, jobz, uplo, 
n, lda); } template <> std::int64_t hegvd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t itype, + sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { @@ -2156,7 +2156,7 @@ std::int64_t hegvd_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t hegvd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t itype, + sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { @@ -2165,66 +2165,66 @@ std::int64_t hegvd_scratchpad_size>(oneapi::mkl::device lib } template <> std::int64_t hetrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].chetrd_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t hetrd_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zhetrd_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t hetrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].chetrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t hetrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zhetrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t 
orgbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return function_tables[libkey].sorgbr_scratchpad_size_sycl(queue, vect, m, n, k, lda); } template <> -std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgbr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return function_tables[libkey].dorgbr_scratchpad_size_sycl(queue, vect, m, n, k, lda); } template <> -std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].sorgtr_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dorgtr_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return function_tables[libkey].sorgqr_scratchpad_size_sycl(queue, m, n, k, lda); } template <> -std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return function_tables[libkey].dorgqr_scratchpad_size_sycl(queue, m, n, k, lda); } template <> -std::int64_t ormrq_scratchpad_size(oneapi::mkl::device 
libkey, sycl::queue &queue, +std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2232,7 +2232,7 @@ std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queu ldc); } template <> -std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2240,7 +2240,7 @@ std::int64_t ormrq_scratchpad_size(oneapi::mkl::device libkey, sycl::que ldc); } template <> -std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2248,7 +2248,7 @@ std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queu ldc); } template <> -std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t ldc) { @@ -2256,7 +2256,7 @@ std::int64_t ormqr_scratchpad_size(oneapi::mkl::device libkey, sycl::que ldc); } template <> -std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) { @@ -2264,7 +2264,7 @@ std::int64_t 
ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queu ldc); } template <> -std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t ldc) { @@ -2272,117 +2272,117 @@ std::int64_t ormtr_scratchpad_size(oneapi::mkl::device libkey, sycl::que ldc); } template <> -std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].spotrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dpotrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t potrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cpotrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t potrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zpotrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) 
{ return function_tables[libkey].spotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb); } template <> -std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return function_tables[libkey].dpotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb); } template <> std::int64_t potrs_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return function_tables[libkey].cpotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb); } template <> std::int64_t potrs_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { return function_tables[libkey].zpotrs_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb); } template <> -std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].spotri_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potri_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dpotri_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t potri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return 
function_tables[libkey].cpotri_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t potri_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zpotri_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].ssytrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sytrf_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dsytrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t sytrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].csytrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t sytrf_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zsytrf_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].ssyevd_scratchpad_size_sycl(queue, jobz, uplo, n, lda); } template <> -std::int64_t 
syevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t syevd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dsyevd_scratchpad_size_sycl(queue, jobz, uplo, n, lda); } template <> -std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { @@ -2390,7 +2390,7 @@ std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queu ldb); } template <> -std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t itype, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t ldb) { @@ -2398,19 +2398,19 @@ std::int64_t sygvd_scratchpad_size(oneapi::mkl::device libkey, sycl::que ldb); } template <> -std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].ssytrd_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t sytrd_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].dsytrd_scratchpad_size_sycl(queue, uplo, n, lda); } template <> -std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, 
oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -2418,7 +2418,7 @@ std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queu lda, ldb); } template <> -std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t ldb) { @@ -2427,7 +2427,7 @@ std::int64_t trtrs_scratchpad_size(oneapi::mkl::device libkey, sycl::que } template <> std::int64_t trtrs_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, @@ -2437,7 +2437,7 @@ std::int64_t trtrs_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t trtrs_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, std::int64_t n, std::int64_t nrhs, std::int64_t lda, @@ -2447,7 +2447,7 @@ std::int64_t trtrs_scratchpad_size>(oneapi::mkl::device lib } template <> std::int64_t ungbr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { @@ -2455,7 +2455,7 @@ std::int64_t ungbr_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t ungbr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, + sycl::queue& queue, oneapi::mkl::generate vect, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { @@ -2463,33 +2463,33 @@ std::int64_t ungbr_scratchpad_size>(oneapi::mkl::device lib } template <> 
std::int64_t ungqr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return function_tables[libkey].cungqr_scratchpad_size_sycl(queue, m, n, k, lda); } template <> std::int64_t ungqr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t m, + sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda) { return function_tables[libkey].zungqr_scratchpad_size_sycl(queue, m, n, k, lda); } template <> std::int64_t ungtr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].cungtr_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t ungtr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::uplo uplo, + sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda) { return function_tables[libkey].zungtr_scratchpad_size_sycl(queue, uplo, n, lda); } template <> std::int64_t unmrq_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2499,7 +2499,7 @@ std::int64_t unmrq_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t unmrq_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2509,7 +2509,7 @@ std::int64_t unmrq_scratchpad_size>(oneapi::mkl::device lib } template <> std::int64_t unmqr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side 
side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2519,7 +2519,7 @@ std::int64_t unmqr_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t unmqr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, @@ -2529,7 +2529,7 @@ std::int64_t unmqr_scratchpad_size>(oneapi::mkl::device lib } template <> std::int64_t unmtr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, @@ -2539,7 +2539,7 @@ std::int64_t unmtr_scratchpad_size>(oneapi::mkl::device libk } template <> std::int64_t unmtr_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, oneapi::mkl::side side, + sycl::queue& queue, oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, std::int64_t m, std::int64_t n, @@ -2548,7 +2548,7 @@ std::int64_t unmtr_scratchpad_size>(oneapi::mkl::device lib ldc); } template <> -std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { @@ -2556,7 +2556,7 @@ std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl stride_ipiv, batch_size); } template <> -std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { @@ -2565,20 +2565,20 
@@ std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, syc } template <> std::int64_t getrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return function_tables[libkey].cgetrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a, stride_ipiv, batch_size); } template <> std::int64_t getrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return function_tables[libkey].zgetrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a, stride_ipiv, batch_size); } template <> -std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { @@ -2586,7 +2586,7 @@ std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl stride_ipiv, batch_size); } template <> -std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { @@ -2595,20 +2595,20 @@ std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, syc } template <> std::int64_t getri_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, 
std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return function_tables[libkey].cgetri_batch_scratchpad_size_sycl(queue, n, lda, stride_a, stride_ipiv, batch_size); } template <> std::int64_t getri_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t n, std::int64_t lda, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t batch_size) { return function_tables[libkey].zgetri_batch_scratchpad_size_sycl(queue, n, lda, stride_a, stride_ipiv, batch_size); } template <> -std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -2618,7 +2618,7 @@ std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size); } template <> -std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, @@ -2629,7 +2629,7 @@ std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, syc } template <> std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return function_tables[libkey].cgetrs_batch_scratchpad_size_sycl( 
@@ -2637,14 +2637,14 @@ std::int64_t getrs_batch_scratchpad_size>( } template <> std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose trans, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose trans, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_ipiv, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return function_tables[libkey].zgetrs_batch_scratchpad_size_sycl( queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b, batch_size); } template <> -std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { @@ -2652,7 +2652,7 @@ std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl stride_tau, batch_size); } template <> -std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { @@ -2661,20 +2661,20 @@ std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, syc } template <> std::int64_t geqrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return function_tables[libkey].cgeqrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a, stride_tau, batch_size); } template <> std::int64_t geqrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, 
sycl::queue &queue, std::int64_t m, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return function_tables[libkey].zgeqrf_batch_scratchpad_size_sycl(queue, m, n, lda, stride_a, stride_tau, batch_size); } template <> -std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { @@ -2682,7 +2682,7 @@ std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl batch_size); } template <> -std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { @@ -2691,20 +2691,20 @@ std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, syc } template <> std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { return function_tables[libkey].cpotrf_batch_scratchpad_size_sycl(queue, uplo, n, lda, stride_a, batch_size); } template <> std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda, std::int64_t stride_a, std::int64_t batch_size) { return function_tables[libkey].zpotrf_batch_scratchpad_size_sycl(queue, uplo, n, lda, stride_a, batch_size); } template <> 
-std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, @@ -2713,7 +2713,7 @@ std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size); } template <> -std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, @@ -2723,7 +2723,7 @@ std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, syc } template <> std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return function_tables[libkey].cpotrs_batch_scratchpad_size_sycl( @@ -2731,14 +2731,14 @@ std::int64_t potrs_batch_scratchpad_size>( } template <> std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo uplo, std::int64_t n, + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t nrhs, std::int64_t lda, std::int64_t stride_a, std::int64_t ldb, std::int64_t stride_b, std::int64_t batch_size) { return function_tables[libkey].zpotrs_batch_scratchpad_size_sycl( queue, uplo, n, nrhs, lda, stride_a, ldb, stride_b, batch_size); } template <> -std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t 
orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { @@ -2746,7 +2746,7 @@ std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl stride_tau, batch_size); } template <> -std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, +std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { @@ -2755,245 +2755,245 @@ std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, syc } template <> std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return function_tables[libkey].cungqr_batch_scratchpad_size_sycl(queue, m, n, k, lda, stride_a, stride_tau, batch_size); } template <> std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t m, std::int64_t n, std::int64_t k, + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t m, std::int64_t n, std::int64_t k, std::int64_t lda, std::int64_t stride_a, std::int64_t stride_tau, std::int64_t batch_size) { return function_tables[libkey].zungqr_batch_scratchpad_size_sycl(queue, m, n, k, lda, stride_a, stride_tau, batch_size); } template <> -std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, +std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, 
std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].sgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].dgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> std::int64_t getrf_batch_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].cgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> std::int64_t getrf_batch_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].zgetrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, std::int64_t *lda, +std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return 
function_tables[libkey].sgetri_group_scratchpad_size_sycl(queue, n, lda, group_count, group_sizes); } template <> -std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *n, std::int64_t *lda, +std::int64_t getri_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].dgetri_group_scratchpad_size_sycl(queue, n, lda, group_count, group_sizes); } template <> std::int64_t getri_batch_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, + sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].cgetri_group_scratchpad_size_sycl(queue, n, lda, group_count, group_sizes); } template <> std::int64_t getri_batch_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *n, - std::int64_t *lda, + sycl::queue& queue, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].zgetri_group_scratchpad_size_sycl(queue, n, lda, group_count, group_sizes); } template <> -std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].sgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb, 
group_count, group_sizes); } template <> -std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t getrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].dgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].cgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t getrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::transpose *trans, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::transpose* trans, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].zgetrs_group_scratchpad_size_sycl(queue, trans, n, nrhs, lda, ldb, group_count, group_sizes); } template <> -std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, 
sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *lda, +std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].sgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t geqrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].dgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> std::int64_t geqrf_batch_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].cgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> std::int64_t geqrf_batch_scratchpad_size>(oneapi::mkl::device libkey, - sycl::queue &queue, std::int64_t *m, - std::int64_t *n, std::int64_t *lda, + sycl::queue& queue, std::int64_t* m, + std::int64_t* n, std::int64_t* lda, std::int64_t group_count, - std::int64_t *group_sizes) { + std::int64_t* group_sizes) { return function_tables[libkey].zgeqrf_group_scratchpad_size_sycl(queue, m, n, lda, group_count, group_sizes); } template <> -std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, 
std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].sorgqr_group_scratchpad_size_sycl(queue, m, n, k, lda, group_count, group_sizes); } template <> -std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - std::int64_t *m, std::int64_t *n, std::int64_t *k, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t orgqr_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + std::int64_t* m, std::int64_t* n, std::int64_t* k, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].dorgqr_group_scratchpad_size_sycl(queue, m, n, k, lda, group_count, group_sizes); } template <> -std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].spotrf_group_scratchpad_size_sycl(queue, uplo, n, lda, group_count, group_sizes); } template <> -std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t potrf_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, + std::int64_t* group_sizes) { return 
function_tables[libkey].dpotrf_group_scratchpad_size_sycl(queue, uplo, n, lda, group_count, group_sizes); } template <> std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes) { return function_tables[libkey].cpotrf_group_scratchpad_size_sycl(queue, uplo, n, lda, group_count, group_sizes); } template <> std::int64_t potrf_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes) { return function_tables[libkey].zpotrf_group_scratchpad_size_sycl(queue, uplo, n, lda, group_count, group_sizes); } template <> -std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { +std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].spotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> -std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue &queue, - oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, - std::int64_t *ldb, std::int64_t group_count, - 
std::int64_t *group_sizes) { +std::int64_t potrs_batch_scratchpad_size(oneapi::mkl::device libkey, sycl::queue& queue, + oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, + std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].dpotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].cpotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t potrs_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, oneapi::mkl::uplo *uplo, std::int64_t *n, - std::int64_t *nrhs, std::int64_t *lda, std::int64_t *ldb, std::int64_t group_count, - std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, oneapi::mkl::uplo* uplo, std::int64_t* n, + std::int64_t* nrhs, std::int64_t* lda, std::int64_t* ldb, std::int64_t group_count, + std::int64_t* group_sizes) { return function_tables[libkey].zpotrs_group_scratchpad_size_sycl(queue, uplo, n, nrhs, lda, ldb, group_count, group_sizes); } template <> std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, std::int64_t group_count, std::int64_t* 
group_sizes) { return function_tables[libkey].cungqr_group_scratchpad_size_sycl(queue, m, n, k, lda, group_count, group_sizes); } template <> std::int64_t ungqr_batch_scratchpad_size>( - oneapi::mkl::device libkey, sycl::queue &queue, std::int64_t *m, std::int64_t *n, - std::int64_t *k, std::int64_t *lda, std::int64_t group_count, std::int64_t *group_sizes) { + oneapi::mkl::device libkey, sycl::queue& queue, std::int64_t* m, std::int64_t* n, + std::int64_t* k, std::int64_t* lda, std::int64_t group_count, std::int64_t* group_sizes) { return function_tables[libkey].zungqr_group_scratchpad_size_sycl(queue, m, n, k, lda, group_count, group_sizes); } diff --git a/src/rng/backends/curand/curand_task.hpp b/src/rng/backends/curand/curand_task.hpp index adc08b840..3437bc07f 100644 --- a/src/rng/backends/curand/curand_task.hpp +++ b/src/rng/backends/curand/curand_task.hpp @@ -15,18 +15,18 @@ namespace rng { namespace curand { #ifdef __HIPSYCL__ template -static inline void host_task_internal(H &cgh, A acc, E e, F f) { +static inline void host_task_internal(H& cgh, A acc, E e, F f) { cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) { curandStatus_t status; CURAND_CALL(curandSetStream, status, e, ih.get_native_queue()); auto r_ptr = - reinterpret_cast(ih.get_native_mem(acc)); + reinterpret_cast(ih.get_native_mem(acc)); f(r_ptr); }); } template -static inline void host_task_internal(H &cgh, E e, F f) { +static inline void host_task_internal(H& cgh, E e, F f) { cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) { curandStatus_t status; CURAND_CALL(curandSetStream, status, e, ih.get_native_queue()); @@ -35,19 +35,19 @@ static inline void host_task_internal(H &cgh, E e, F f) { } #else template -static inline void host_task_internal(H &cgh, A acc, E e, F f) { +static inline void host_task_internal(H& cgh, A acc, E e, F f) { cgh.host_task([=](sycl::interop_handle ih) { curandStatus_t status; auto stream = ih.get_native_queue(); 
CURAND_CALL(curandSetStream, status, e, stream); - auto r_ptr = reinterpret_cast( + auto r_ptr = reinterpret_cast( ih.get_native_mem(acc)); f(r_ptr); }); } template -static inline void host_task_internal(H &cgh, E e, F f) { +static inline void host_task_internal(H& cgh, E e, F f) { cgh.host_task([=](sycl::interop_handle ih) { curandStatus_t status; auto stream = ih.get_native_queue(); @@ -57,12 +57,12 @@ static inline void host_task_internal(H &cgh, E e, F f) { } #endif template -static inline void onemkl_curand_host_task(H &cgh, A acc, E e, F f) { +static inline void onemkl_curand_host_task(H& cgh, A acc, E e, F f) { host_task_internal(cgh, acc, e, f); } template -static inline void onemkl_curand_host_task(H &cgh, Engine e, F f) { +static inline void onemkl_curand_host_task(H& cgh, Engine e, F f) { host_task_internal(cgh, e, f); } diff --git a/src/rng/backends/mklcpu/cpu_common.hpp b/src/rng/backends/mklcpu/cpu_common.hpp index cbd6cae59..a65338c91 100644 --- a/src/rng/backends/mklcpu/cpu_common.hpp +++ b/src/rng/backends/mklcpu/cpu_common.hpp @@ -34,19 +34,19 @@ namespace mklcpu { // host_task automatically uses run_on_host_intel if it is supported by the // compiler. Otherwise, it falls back to single_task. 
template -static inline auto host_task_internal(H &cgh, F f, int) -> decltype(cgh.host_task(f)) { +static inline auto host_task_internal(H& cgh, F f, int) -> decltype(cgh.host_task(f)) { return cgh.host_task(f); } template -static inline void host_task_internal(H &cgh, F f, long) { +static inline void host_task_internal(H& cgh, F f, long) { #ifndef __SYCL_DEVICE_ONLY__ cgh.template single_task(f); #endif } template -static inline void host_task(H &cgh, F f) { +static inline void host_task(H& cgh, F f) { (void)host_task_internal(cgh, f, 0); } @@ -57,7 +57,7 @@ template class kernel_name_usm {}; template -typename Acc::value_type *get_raw_ptr(Acc acc) { +typename Acc::value_type* get_raw_ptr(Acc acc) { // Workaround for AdaptiveCPP, as they do not yet support the get_multi_ptr function #ifndef __HIPSYCL__ return acc.template get_multi_ptr().get_raw(); diff --git a/src/rng/backends/rocrand/rocrand_task.hpp b/src/rng/backends/rocrand/rocrand_task.hpp index 2588dc901..fac8a154f 100644 --- a/src/rng/backends/rocrand/rocrand_task.hpp +++ b/src/rng/backends/rocrand/rocrand_task.hpp @@ -15,18 +15,18 @@ namespace rng { namespace rocrand { #ifdef __HIPSYCL__ template -static inline void host_task_internal(H &cgh, A acc, E e, F f) { +static inline void host_task_internal(H& cgh, A acc, E e, F f) { cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) { rocrand_status status; ROCRAND_CALL(rocrand_set_stream, status, e, ih.get_native_queue()); auto r_ptr = - reinterpret_cast(ih.get_native_mem(acc)); + reinterpret_cast(ih.get_native_mem(acc)); f(r_ptr); }); } template -static inline void host_task_internal(H &cgh, E e, F f) { +static inline void host_task_internal(H& cgh, E e, F f) { cgh.hipSYCL_enqueue_custom_operation([=](sycl::interop_handle ih) { rocrand_status status; ROCRAND_CALL(rocrand_set_stream, status, e, ih.get_native_queue()); @@ -35,12 +35,12 @@ static inline void host_task_internal(H &cgh, E e, F f) { } #else template -static inline void 
host_task_internal(H &cgh, A acc, E e, F f) { +static inline void host_task_internal(H& cgh, A acc, E e, F f) { cgh.host_task([=](sycl::interop_handle ih) { rocrand_status status; auto stream = ih.get_native_queue(); ROCRAND_CALL(rocrand_set_stream, status, e, stream); - auto r_ptr = reinterpret_cast( + auto r_ptr = reinterpret_cast( ih.get_native_mem(acc)); f(r_ptr); @@ -50,7 +50,7 @@ static inline void host_task_internal(H &cgh, A acc, E e, F f) { } template -static inline void host_task_internal(H &cgh, E e, F f) { +static inline void host_task_internal(H& cgh, E e, F f) { cgh.host_task([=](sycl::interop_handle ih) { rocrand_status status; auto stream = ih.get_native_queue(); @@ -63,12 +63,12 @@ static inline void host_task_internal(H &cgh, E e, F f) { } #endif template -static inline void onemkl_rocrand_host_task(H &cgh, A acc, E e, F f) { +static inline void onemkl_rocrand_host_task(H& cgh, A acc, E e, F f) { host_task_internal(cgh, acc, e, f); } template -static inline void onemkl_rocrand_host_task(H &cgh, Engine e, F f) { +static inline void onemkl_rocrand_host_task(H& cgh, Engine e, F f) { host_task_internal(cgh, e, f); } diff --git a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp index 59e582a65..b77db5529 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp @@ -33,11 +33,11 @@ namespace oneapi::mkl::sparse::cusparse { template struct cusparse_global_handle { - using handle_container_t = std::unordered_map *>; + using handle_container_t = std::unordered_map*>; handle_container_t cusparse_global_handle_mapper_{}; ~cusparse_global_handle() noexcept(false) { - for (auto &handle_pair : cusparse_global_handle_mapper_) { + for (auto& handle_pair : cusparse_global_handle_mapper_) { if (handle_pair.second != nullptr) { auto handle = handle_pair.second->exchange(nullptr); if (handle != nullptr) { diff 
--git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp index cd907fb6b..920e32a21 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -33,9 +33,9 @@ namespace oneapi::mkl::sparse::cusparse { // Dense vector template -void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size, +void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, sycl::buffer val) { - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. @@ -51,9 +51,9 @@ void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, st } template -void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, std::int64_t size, - fpType *val) { - auto event = queue.submit([&](sycl::handler &cgh) { +void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, + fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
CusparseScopedContextHandler(queue, ih).get_handle(queue); @@ -67,10 +67,10 @@ void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, st } template -void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, std::int64_t size, +void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, std::int64_t size, sycl::buffer val) { detail::check_can_reset_value_handle(__func__, dvhandle, true); - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (dvhandle->size != size) { @@ -91,8 +91,8 @@ void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, s } template -void set_dense_vector_data(sycl::queue &, dense_vector_handle_t dvhandle, std::int64_t size, - fpType *val) { +void set_dense_vector_data(sycl::queue&, dense_vector_handle_t dvhandle, std::int64_t size, + fpType* val) { detail::check_can_reset_value_handle(__func__, dvhandle, false); if (dvhandle->size != size) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); @@ -109,8 +109,8 @@ void set_dense_vector_data(sycl::queue &, dense_vector_handle_t dvhandle, std::i FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); -sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, - const std::vector &dependencies) { +sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle, + const std::vector& dependencies) { // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used auto functor = [=](sycl::interop_handle) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); @@ -121,10 +121,10 @@ sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhan // Dense matrix template -void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t 
*p_dmhandle, std::int64_t num_rows, +void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, sycl::buffer val) { - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. @@ -142,9 +142,9 @@ void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, st } template -void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, std::int64_t num_rows, - std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType *val) { - auto event = queue.submit([&](sycl::handler &cgh) { +void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, + std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
CusparseScopedContextHandler(queue, ih).get_handle(queue); @@ -161,11 +161,11 @@ void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, st } template -void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, +void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, sycl::buffer val) { detail::check_can_reset_value_handle(__func__, dmhandle, true); - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || @@ -191,9 +191,9 @@ void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, } template -void set_dense_matrix_data(sycl::queue &, dense_matrix_handle_t dmhandle, std::int64_t num_rows, +void set_dense_matrix_data(sycl::queue&, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, - fpType *val) { + fpType* val) { detail::check_can_reset_value_handle(__func__, dmhandle, false); if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { @@ -215,8 +215,8 @@ void set_dense_matrix_data(sycl::queue &, dense_matrix_handle_t dmhandle, std::i FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); -sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, - const std::vector &dependencies) { +sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle, + const std::vector& dependencies) { // Use dispatch_submit_impl_fp to ensure the backend's handle is kept alive as long as the buffer is used auto functor = [=](sycl::interop_handle) { CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, 
dmhandle->backend_handle); @@ -227,11 +227,11 @@ sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhan // COO matrix template -void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, +void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val) { - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto row_acc = row_ind.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); @@ -245,18 +245,19 @@ void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc), cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, - num_rows, num_cols, nnz, index); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); } template -void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, +void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, - intType *row_ind, intType *col_ind, fpType *val) { - auto event = queue.submit([&](sycl::handler &cgh) { + intType* row_ind, intType* col_ind, fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
CusparseScopedContextHandler(queue, ih).get_handle(queue); @@ -266,20 +267,21 @@ void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, - num_rows, num_cols, nnz, index); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); } template -void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, +void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val) { detail::check_can_reset_sparse_handle(__func__, smhandle, true); - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto row_acc = row_ind.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); @@ -312,9 +314,9 @@ void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 } template -void set_coo_matrix_data(sycl::queue &, matrix_handle_t smhandle, std::int64_t num_rows, +void set_coo_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, - intType *row_ind, intType *col_ind, fpType *val) { + intType* row_ind, intType* col_ind, fpType* val) { detail::check_can_reset_sparse_handle(__func__, smhandle, false); if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { @@ -341,11 +343,11 @@ 
FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS); // CSR matrix template -void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, +void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val) { - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto row_acc = row_ptr.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); @@ -359,18 +361,19 @@ void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc), cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, - num_rows, num_cols, nnz, index); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); } template -void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, +void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, - intType *row_ptr, intType *col_ind, fpType *val) { - auto event = queue.submit([&](sycl::handler &cgh) { + intType* row_ptr, intType* col_ind, fpType* val) { + auto event = queue.submit([&](sycl::handler& cgh) { submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
CusparseScopedContextHandler(queue, ih).get_handle(queue); @@ -381,20 +384,21 @@ void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64 CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); - *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, - num_rows, num_cols, nnz, index); + *p_smhandle = + new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, + num_rows, num_cols, nnz, index); }); }); event.wait_and_throw(); } template -void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, +void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val) { detail::check_can_reset_sparse_handle(__func__, smhandle, true); - auto event = queue.submit([&](sycl::handler &cgh) { + auto event = queue.submit([&](sycl::handler& cgh) { auto row_acc = row_ptr.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); @@ -427,9 +431,9 @@ void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int6 } template -void set_csr_matrix_data(sycl::queue &, matrix_handle_t smhandle, std::int64_t num_rows, +void set_csr_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, - intType *row_ptr, intType *col_ind, fpType *val) { + intType* row_ptr, intType* col_ind, fpType* val) { detail::check_can_reset_sparse_handle(__func__, smhandle, false); if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { @@ -455,8 +459,8 @@ void set_csr_matrix_data(sycl::queue 
&, matrix_handle_t smhandle, std::int64_t n FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); -sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle, - const std::vector &dependencies) { +sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle, + const std::vector& dependencies) { // Use dispatch_submit to ensure the backend's handle is kept alive as long as the buffers are used auto functor = [=](sycl::interop_handle) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); @@ -466,7 +470,7 @@ sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle, } // Matrix property -bool set_matrix_property(sycl::queue &, matrix_handle_t smhandle, matrix_property property) { +bool set_matrix_property(sycl::queue&, matrix_handle_t smhandle, matrix_property property) { // No equivalent in cuSPARSE // Store the matrix property internally for future usages smhandle->set_matrix_property(property); diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp index c25c7c92f..8b48d16dd 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp @@ -41,7 +41,7 @@ thread_local cusparse_global_handle CusparseScopedContextHandler::ha #endif CusparseScopedContextHandler::CusparseScopedContextHandler(sycl::queue queue, - sycl::interop_handle &ih) + sycl::interop_handle& ih) : ih(ih), needToRecover_(false) { placedContext_ = new sycl::context(queue.get_context()); @@ -69,8 +69,8 @@ CusparseScopedContextHandler::~CusparseScopedContextHandler() noexcept(false) { delete placedContext_; } -void ContextCallback(void *userData) { - auto *ptr = static_cast *>(userData); +void ContextCallback(void* userData) { + auto* ptr = static_cast*>(userData); if (!ptr) { return; } @@ -88,7 +88,7 @@ void ContextCallback(void *userData) { } std::pair 
CusparseScopedContextHandler::get_handle_and_stream( - const sycl::queue &queue) { + const sycl::queue& queue) { auto cudaDevice = ih.get_native_device(); CUcontext desired; CUDA_ERROR_FUNC(cuDevicePrimaryCtxRetain, &desired, cudaDevice); @@ -132,15 +132,15 @@ std::pair CusparseScopedContextHandler::get_handle_a return { handle, streamId }; } -cusparseHandle_t CusparseScopedContextHandler::get_handle(const sycl::queue &queue) { +cusparseHandle_t CusparseScopedContextHandler::get_handle(const sycl::queue& queue) { return get_handle_and_stream(queue).first; } -CUstream CusparseScopedContextHandler::get_stream(const sycl::queue &queue) { +CUstream CusparseScopedContextHandler::get_stream(const sycl::queue& queue) { return sycl::get_native(queue); } -sycl::context CusparseScopedContextHandler::get_context(const sycl::queue &queue) { +sycl::context CusparseScopedContextHandler::get_context(const sycl::queue& queue) { return queue.get_context(); } diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp index d872cbab3..4b1ecd3e4 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp @@ -44,8 +44,8 @@ namespace oneapi::mkl::sparse::cusparse { class CusparseScopedContextHandler { CUcontext original_; - sycl::context *placedContext_; - sycl::interop_handle &ih; + sycl::context* placedContext_; + sycl::interop_handle& ih; bool needToRecover_; #ifdef ONEAPI_ONEMKL_PI_INTERFACE_REMOVED @@ -54,11 +54,11 @@ class CusparseScopedContextHandler { static thread_local cusparse_global_handle handle_helper; #endif - CUstream get_stream(const sycl::queue &queue); - sycl::context get_context(const sycl::queue &queue); + CUstream get_stream(const sycl::queue& queue); + sycl::context get_context(const sycl::queue& queue); public: - CusparseScopedContextHandler(sycl::queue queue, sycl::interop_handle &ih); + 
CusparseScopedContextHandler(sycl::queue queue, sycl::interop_handle& ih); ~CusparseScopedContextHandler() noexcept(false); @@ -69,18 +69,18 @@ class CusparseScopedContextHandler { * @param queue sycl queue. * @return a pair of: cusparseHandle_t a handle to construct cusparse routines; and a CUDA stream */ - std::pair get_handle_and_stream(const sycl::queue &queue); + std::pair get_handle_and_stream(const sycl::queue& queue); /// See get_handle_and_stream - cusparseHandle_t get_handle(const sycl::queue &queue); + cusparseHandle_t get_handle(const sycl::queue& queue); }; // Get the native pointer from an accessor. This is a different pointer than // what can be retrieved with get_multi_ptr. template -inline void *get_mem(sycl::interop_handle ih, AccT acc) { +inline void* get_mem(sycl::interop_handle ih, AccT acc) { auto cudaPtr = ih.get_native_mem(acc); - return reinterpret_cast(cudaPtr); + return reinterpret_cast(cudaPtr); } } // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp index 3c51e8514..d380d9785 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_task.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp @@ -28,40 +28,40 @@ namespace oneapi::mkl::sparse::cusparse { template -auto get_value_accessor(sycl::handler &cgh, Container container) { +auto get_value_accessor(sycl::handler& cgh, Container container) { auto buffer_ptr = - reinterpret_cast *>(container->value_container.buffer_ptr.get()); + reinterpret_cast*>(container->value_container.buffer_ptr.get()); return buffer_ptr->template get_access(cgh); } template -auto get_fp_accessors(sycl::handler &cgh, Ts... containers) { +auto get_fp_accessors(sycl::handler& cgh, Ts... containers) { return std::array, sizeof...(containers)>{ get_value_accessor( cgh, containers)... 
}; } template -auto get_row_accessor(sycl::handler &cgh, matrix_handle_t smhandle) { +auto get_row_accessor(sycl::handler& cgh, matrix_handle_t smhandle) { auto buffer_ptr = - reinterpret_cast *>(smhandle->row_container.buffer_ptr.get()); + reinterpret_cast*>(smhandle->row_container.buffer_ptr.get()); return buffer_ptr->template get_access(cgh); } template -auto get_col_accessor(sycl::handler &cgh, matrix_handle_t smhandle) { +auto get_col_accessor(sycl::handler& cgh, matrix_handle_t smhandle) { auto buffer_ptr = - reinterpret_cast *>(smhandle->col_container.buffer_ptr.get()); + reinterpret_cast*>(smhandle->col_container.buffer_ptr.get()); return buffer_ptr->template get_access(cgh); } template -auto get_int_accessors(sycl::handler &cgh, matrix_handle_t smhandle) { +auto get_int_accessors(sycl::handler& cgh, matrix_handle_t smhandle) { return std::array, 2>{ get_row_accessor(cgh, smhandle), get_col_accessor(cgh, smhandle) }; } template -void submit_host_task(sycl::handler &cgh, sycl::queue &queue, Functor functor, +void submit_host_task(sycl::handler& cgh, sycl::queue& queue, Functor functor, CaptureOnlyAcc... capture_only_accessors) { // Only capture the accessors to ensure the dependencies are properly // handled. The accessors's pointer have already been set to the native @@ -78,7 +78,7 @@ void submit_host_task(sycl::handler &cgh, sycl::queue &queue, Functor functor, } template -void submit_host_task_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor functor, +void submit_host_task_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor, sycl::accessor workspace_placeholder_acc, CaptureOnlyAcc... 
capture_only_accessors) { // Only capture the accessors to ensure the dependencies are properly @@ -97,8 +97,8 @@ void submit_host_task_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor f } template -void submit_native_command_ext(sycl::handler &cgh, sycl::queue &queue, Functor functor, - const std::vector &dependencies, +void submit_native_command_ext(sycl::handler& cgh, sycl::queue& queue, Functor functor, + const std::vector& dependencies, CaptureOnlyAcc... capture_only_accessors) { // Only capture the accessors to ensure the dependencies are properly // handled. The accessors's pointer have already been set to the native @@ -135,8 +135,8 @@ void submit_native_command_ext(sycl::handler &cgh, sycl::queue &queue, Functor f } template -void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, Functor functor, - const std::vector &dependencies, +void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor, + const std::vector& dependencies, sycl::accessor workspace_placeholder_acc, CaptureOnlyAcc... capture_only_accessors) { // Only capture the accessors to ensure the dependencies are properly @@ -191,8 +191,8 @@ void submit_native_command_ext_with_acc(sycl::handler &cgh, sycl::queue &queue, /// a different cuStream can be used inside the native_command than the native /// cuStream used by the extension. template -sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl::queue queue, - const std::vector &dependencies, +sycl::event dispatch_submit_impl_fp_int(const std::string& function_name, sycl::queue queue, + const std::vector& dependencies, Functor functor, matrix_handle_t sm_handle, sycl::buffer workspace_buffer, Ts... 
other_containers) { @@ -202,7 +202,7 @@ sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl:: detail::data_type int_type = sm_handle->get_int_type(); #define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE) \ - return queue.submit([&](sycl::handler &cgh) { \ + return queue.submit([&](sycl::handler& cgh) { \ cgh.depends_on(dependencies); \ auto fp_accs = get_fp_accessors(cgh, sm_handle, other_containers...); \ auto int_accs = get_int_accessors(cgh, sm_handle); \ @@ -268,7 +268,7 @@ sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl:: else { // USM submit does not need to capture accessors if constexpr (!UseWorkspace) { - return queue.submit([&](sycl::handler &cgh) { + return queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); if constexpr (UseEnqueueNativeCommandExt) { if (is_in_order_queue) { @@ -292,14 +292,14 @@ sycl::event dispatch_submit_impl_fp_int(const std::string &function_name, sycl:: /// Similar to dispatch_submit_impl_fp_int but only dispatches the host_task based on the floating point value type. 
template -sycl::event dispatch_submit_impl_fp(const std::string &function_name, sycl::queue queue, - const std::vector &dependencies, Functor functor, +sycl::event dispatch_submit_impl_fp(const std::string& function_name, sycl::queue queue, + const std::vector& dependencies, Functor functor, ContainerT container_handle) { if (container_handle->all_use_buffer()) { detail::data_type value_type = container_handle->get_value_type(); #define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE) \ - return queue.submit([&](sycl::handler &cgh) { \ + return queue.submit([&](sycl::handler& cgh) { \ cgh.depends_on(dependencies); \ auto fp_accs = get_fp_accessors(cgh, container_handle); \ submit_host_task(cgh, queue, functor, fp_accs); \ @@ -324,7 +324,7 @@ sycl::event dispatch_submit_impl_fp(const std::string &function_name, sycl::queu "Could not dispatch buffer kernel to a supported type"); } else { - return queue.submit([&](sycl::handler &cgh) { + return queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); submit_host_task(cgh, queue, functor); }); @@ -333,7 +333,7 @@ sycl::event dispatch_submit_impl_fp(const std::string &function_name, sycl::queu /// Helper function for dispatch_submit_impl_fp_int template -sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, Functor functor, +sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor, matrix_handle_t sm_handle, sycl::buffer workspace_buffer, Ts... other_containers) { constexpr bool UseWorkspace = true; @@ -344,8 +344,8 @@ sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, /// Helper function for dispatch_submit_impl_fp_int template -sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, - const std::vector &dependencies, Functor functor, +sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, + const std::vector& dependencies, Functor functor, matrix_handle_t sm_handle, Ts... 
other_containers) { constexpr bool UseWorkspace = false; constexpr bool UseEnqueueNativeCommandExt = false; @@ -356,7 +356,7 @@ sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, /// Helper function for dispatch_submit_impl_fp_int template -sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, Functor functor, +sycl::event dispatch_submit(const std::string& function_name, sycl::queue queue, Functor functor, matrix_handle_t sm_handle, Ts... other_containers) { constexpr bool UseWorkspace = false; constexpr bool UseEnqueueNativeCommandExt = false; @@ -367,7 +367,7 @@ sycl::event dispatch_submit(const std::string &function_name, sycl::queue queue, /// Helper function for dispatch_submit_impl_fp_int template -sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue, +sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue, Functor functor, matrix_handle_t sm_handle, sycl::buffer workspace_buffer, Ts... other_containers) { @@ -383,8 +383,8 @@ sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::q /// Helper function for dispatch_submit_impl_fp_int template -sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue, - const std::vector &dependencies, +sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue, + const std::vector& dependencies, Functor functor, matrix_handle_t sm_handle, Ts... other_containers) { constexpr bool UseWorkspace = false; @@ -400,7 +400,7 @@ sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::q /// Helper function for dispatch_submit_impl_fp_int template -sycl::event dispatch_submit_native_ext(const std::string &function_name, sycl::queue queue, +sycl::event dispatch_submit_native_ext(const std::string& function_name, sycl::queue queue, Functor functor, matrix_handle_t sm_handle, Ts... 
other_containers) { constexpr bool UseWorkspace = false; diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index f00650f65..d1102b93a 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -54,12 +54,12 @@ struct spmv_descr { namespace oneapi::mkl::sparse::cusparse { -void init_spmv_descr(sycl::queue & /*queue*/, spmv_descr_t *p_spmv_descr) { +void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) { *p_spmv_descr = new spmv_descr(); } -sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, - const std::vector &dependencies) { +sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, + const std::vector& dependencies) { if (!spmv_descr) { return detail::collapse_dependencies(queue, dependencies); } @@ -88,7 +88,7 @@ sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, } // Release used if USM is used or if the descriptor has been released before spmv_optimize has succeeded - sycl::event event = queue.submit([&](sycl::handler &cgh) { + sycl::event event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.host_task(release_functor); }); @@ -105,7 +105,7 @@ inline auto get_cuda_spmv_alg(spmv_alg alg) { } } -void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose opA, +void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose opA, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { @@ -119,10 +119,10 @@ void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose o } } -void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_buffer_size(sycl::queue& queue, 
oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, @@ -173,9 +173,9 @@ inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_ #if CUSPARSE_VERSION >= 12300 // cusparseSpMV_preprocess was added in cuSPARSE 12.3.0.142 (CUDA 12.4) -void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha, - matrix_handle_t A_handle, dense_vector_handle_t x_handle, const void *beta, - dense_vector_handle_t y_handle, spmv_alg alg, void *workspace_ptr, +void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha, + matrix_handle_t A_handle, dense_vector_handle_t x_handle, const void* beta, + dense_vector_handle_t y_handle, spmv_alg alg, void* workspace_ptr, bool is_alpha_host_accessible) { auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; @@ -191,9 +191,9 @@ void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, } #endif -void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, sycl::buffer workspace) { bool 
is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -234,11 +234,11 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a #endif } -sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, - void *workspace, const std::vector &dependencies) { + void* workspace, const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); if (A_handle->all_use_buffer()) { @@ -264,10 +264,10 @@ sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const #endif } -sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, const std::vector &dependencies) { + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, @@ -288,7 +288,7 @@ sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alp CHECK_DESCR_MATCH(spmv_descr, alg, 
"spmv_optimize"); bool is_in_order_queue = queue.is_in_order(); - auto compute_functor = [=](void *workspace_ptr) { + auto compute_functor = [=](void* workspace_ptr) { auto cu_handle = spmv_descr->cu_handle; auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index 392318460..4f2b60502 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -54,13 +54,13 @@ struct spsv_descr { namespace oneapi::mkl::sparse::cusparse { -void init_spsv_descr(sycl::queue & /*queue*/, spsv_descr_t *p_spsv_descr) { +void init_spsv_descr(sycl::queue& /*queue*/, spsv_descr_t* p_spsv_descr) { *p_spsv_descr = new spsv_descr(); CUSPARSE_ERR_FUNC(cusparseSpSV_createDescr, &(*p_spsv_descr)->cu_descr); } -sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, - const std::vector &dependencies) { +sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, + const std::vector& dependencies) { if (!spsv_descr) { return detail::collapse_dependencies(queue, dependencies); } @@ -91,7 +91,7 @@ sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, } // Release used if USM is used or if the descriptor has been released before spsv_optimize has succeeded - sycl::event event = queue.submit([&](sycl::handler &cgh) { + sycl::event event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.host_task(release_functor); }); @@ -102,7 +102,7 @@ inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { return CUSPARSE_SPSV_ALG_DEFAULT; } -void check_valid_spsv(const std::string &function_name, matrix_view A_view, +void check_valid_spsv(const std::string& function_name, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool 
is_alpha_host_accessible) { detail::check_valid_spsv_common(function_name, A_view, A_handle, x_handle, y_handle, @@ -110,10 +110,10 @@ void check_valid_spsv(const std::string &function_name, matrix_view A_view, check_valid_matrix_properties(function_name, A_handle); } -void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - std::size_t &temp_buffer_size) { + std::size_t& temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { @@ -159,10 +159,10 @@ inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_ spsv_descr->last_optimized_alg = alg; } -void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void *alpha, +void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, - spsv_alg alg, spsv_descr_t spsv_descr, void *workspace_ptr, + spsv_alg alg, spsv_descr_t spsv_descr, void* workspace_ptr, bool is_alpha_host_accessible) { auto cu_a = A_handle->backend_handle; auto cu_x = x_handle->backend_handle; @@ -179,7 +179,7 @@ void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, check_status(status, "spsv_optimize"); } -void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, 
dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer workspace) { @@ -215,11 +215,11 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a } } -sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, - spsv_alg alg, spsv_descr_t spsv_descr, void *workspace, - const std::vector &dependencies) { + spsv_alg alg, spsv_descr_t spsv_descr, void* workspace, + const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); if (A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); @@ -236,10 +236,10 @@ sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const return dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle); } -sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - const std::vector &dependencies) { + const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { diff --git a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx index 6ddbd43ef..5fa5ea0a4 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_handles.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_handles.cxx @@ -21,19 +21,19 @@ // Dense vector 
template -void init_dense_vector(sycl::queue & /*queue*/, dense_vector_handle_t *p_dvhandle, - std::int64_t size, sycl::buffer val) { +void init_dense_vector(sycl::queue& /*queue*/, dense_vector_handle_t* p_dvhandle, std::int64_t size, + sycl::buffer val) { *p_dvhandle = new dense_vector_handle(val, size); } template -void init_dense_vector(sycl::queue & /*queue*/, dense_vector_handle_t *p_dvhandle, - std::int64_t size, fpType *val) { +void init_dense_vector(sycl::queue& /*queue*/, dense_vector_handle_t* p_dvhandle, std::int64_t size, + fpType* val) { *p_dvhandle = new dense_vector_handle(val, size); } template -void set_dense_vector_data(sycl::queue & /*queue*/, dense_vector_handle_t dvhandle, +void set_dense_vector_data(sycl::queue& /*queue*/, dense_vector_handle_t dvhandle, std::int64_t size, sycl::buffer val) { detail::check_can_reset_value_handle(__func__, dvhandle, true); dvhandle->size = size; @@ -41,8 +41,8 @@ void set_dense_vector_data(sycl::queue & /*queue*/, dense_vector_handle_t dvhand } template -void set_dense_vector_data(sycl::queue & /*queue*/, dense_vector_handle_t dvhandle, - std::int64_t size, fpType *val) { +void set_dense_vector_data(sycl::queue& /*queue*/, dense_vector_handle_t dvhandle, + std::int64_t size, fpType* val) { detail::check_can_reset_value_handle(__func__, dvhandle, false); dvhandle->size = size; dvhandle->set_usm_ptr(val); @@ -50,28 +50,28 @@ void set_dense_vector_data(sycl::queue & /*queue*/, dense_vector_handle_t dvhand FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_VECTOR_FUNCS); -sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, - const std::vector &dependencies) { +sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle, + const std::vector& dependencies) { return detail::submit_release(queue, dvhandle, dependencies); } // Dense matrix template -void init_dense_matrix(sycl::queue & /*queue*/, dense_matrix_handle_t *p_dmhandle, +void init_dense_matrix(sycl::queue& /*queue*/, 
dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, sycl::buffer val) { *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); } template -void init_dense_matrix(sycl::queue & /*queue*/, dense_matrix_handle_t *p_dmhandle, +void init_dense_matrix(sycl::queue& /*queue*/, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, - oneapi::mkl::layout dense_layout, fpType *val) { + oneapi::mkl::layout dense_layout, fpType* val) { *p_dmhandle = new dense_matrix_handle(val, num_rows, num_cols, ld, dense_layout); } template -void set_dense_matrix_data(sycl::queue & /*queue*/, dense_matrix_handle_t dmhandle, +void set_dense_matrix_data(sycl::queue& /*queue*/, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, oneapi::mkl::layout dense_layout, sycl::buffer val) { detail::check_can_reset_value_handle(__func__, dmhandle, true); @@ -83,9 +83,9 @@ void set_dense_matrix_data(sycl::queue & /*queue*/, dense_matrix_handle_t dmhand } template -void set_dense_matrix_data(sycl::queue & /*queue*/, dense_matrix_handle_t dmhandle, +void set_dense_matrix_data(sycl::queue& /*queue*/, dense_matrix_handle_t dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, - oneapi::mkl::layout dense_layout, fpType *val) { + oneapi::mkl::layout dense_layout, fpType* val) { detail::check_can_reset_value_handle(__func__, dmhandle, false); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; @@ -96,21 +96,22 @@ void set_dense_matrix_data(sycl::queue & /*queue*/, dense_matrix_handle_t dmhand FOR_EACH_FP_TYPE(INSTANTIATE_DENSE_MATRIX_FUNCS); -sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, - const std::vector &dependencies) { +sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle, + const std::vector& dependencies) { return 
detail::submit_release(queue, dmhandle, dependencies); } // COO matrix template -void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle, +void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle( - mkl_handle, row_ind, col_ind, val, detail::sparse_format::COO, num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val, + detail::sparse_format::COO, num_rows, + num_cols, nnz, index); // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. oneapi::mkl::sparse::set_coo_data(queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), static_cast(nnz), @@ -121,14 +122,15 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p } template -void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle, +void init_coo_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, - oneapi::mkl::index_base index, intType *row_ind, intType *col_ind, - fpType *val) { + oneapi::mkl::index_base index, intType* row_ind, intType* col_ind, + fpType* val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle( - mkl_handle, row_ind, col_ind, val, detail::sparse_format::COO, num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ind, col_ind, val, + 
detail::sparse_format::COO, num_rows, + num_cols, nnz, index); auto event = oneapi::mkl::sparse::set_coo_data( queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), static_cast(nnz), index, row_ind, col_ind, val); @@ -137,7 +139,7 @@ void init_coo_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p } template -void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, +void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ind, sycl::buffer col_ind, sycl::buffer val) { @@ -160,10 +162,10 @@ void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_ } template -void set_coo_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, +void set_coo_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, - oneapi::mkl::index_base index, intType *row_ind, intType *col_ind, - fpType *val) { + oneapi::mkl::index_base index, intType* row_ind, intType* col_ind, + fpType* val) { auto internal_smhandle = detail::get_internal_handle(smhandle); detail::check_can_reset_sparse_handle(__func__, internal_smhandle, false); internal_smhandle->num_rows = num_rows; @@ -183,14 +185,15 @@ FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_COO_MATRIX_FUNCS); // CSR matrix template -void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle, +void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new 
detail::sparse_matrix_handle( - mkl_handle, row_ptr, col_ind, val, detail::sparse_format::CSR, num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, + detail::sparse_format::CSR, num_rows, + num_cols, nnz, index); // The backend deduces nnz from row_ptr. // The backend handle must use the buffers from the internal handle as they will be kept alive until the handle is released. oneapi::mkl::sparse::set_csr_data(queue, mkl_handle, static_cast(num_rows), @@ -202,14 +205,15 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p } template -void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p_smhandle, +void init_csr_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, - oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind, - fpType *val) { + oneapi::mkl::index_base index, intType* row_ptr, intType* col_ind, + fpType* val) { oneapi::mkl::sparse::matrix_handle_t mkl_handle; oneapi::mkl::sparse::init_matrix_handle(&mkl_handle); - auto internal_smhandle = new detail::sparse_matrix_handle( - mkl_handle, row_ptr, col_ind, val, detail::sparse_format::CSR, num_rows, num_cols, nnz, index); + auto internal_smhandle = new detail::sparse_matrix_handle(mkl_handle, row_ptr, col_ind, val, + detail::sparse_format::CSR, num_rows, + num_cols, nnz, index); // The backend deduces nnz from row_ptr. 
auto event = oneapi::mkl::sparse::set_csr_data( queue, mkl_handle, static_cast(num_rows), static_cast(num_cols), index, @@ -219,7 +223,7 @@ void init_csr_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t *p } template -void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, +void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, sycl::buffer row_ptr, sycl::buffer col_ind, sycl::buffer val) { @@ -243,10 +247,10 @@ void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_ } template -void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, +void set_csr_matrix_data(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, - oneapi::mkl::index_base index, intType *row_ptr, intType *col_ind, - fpType *val) { + oneapi::mkl::index_base index, intType* row_ptr, intType* col_ind, + fpType* val) { auto internal_smhandle = detail::get_internal_handle(smhandle); detail::check_can_reset_sparse_handle(__func__, internal_smhandle, false); internal_smhandle->num_rows = num_rows; @@ -266,8 +270,8 @@ void set_csr_matrix_data(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_ FOR_EACH_FP_AND_INT_TYPE(INSTANTIATE_CSR_MATRIX_FUNCS); // Common sparse matrix functions -sycl::event release_sparse_matrix(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, - const std::vector &dependencies) { +sycl::event release_sparse_matrix(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, + const std::vector& dependencies) { auto internal_smhandle = detail::get_internal_handle(smhandle); // Asynchronously release the backend's handle followed by the internal handle. 
auto event = oneapi::mkl::sparse::release_matrix_handle( @@ -275,7 +279,7 @@ sycl::event release_sparse_matrix(sycl::queue &queue, oneapi::mkl::sparse::matri return detail::submit_release(queue, internal_smhandle, { event }); } -bool set_matrix_property(sycl::queue & /*queue*/, oneapi::mkl::sparse::matrix_handle_t smhandle, +bool set_matrix_property(sycl::queue& /*queue*/, oneapi::mkl::sparse::matrix_handle_t smhandle, matrix_property property) { auto internal_smhandle = detail::get_internal_handle(smhandle); // Store the matrix property internally for better error checking diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx index ad12edcfb..9c0bc577b 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmm.cxx @@ -37,16 +37,16 @@ struct spmm_descr { namespace oneapi::mkl::sparse::BACKEND { -void init_spmm_descr(sycl::queue & /*queue*/, spmm_descr_t *p_spmm_descr) { +void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { *p_spmm_descr = new spmm_descr(); } -sycl::event release_spmm_descr(sycl::queue &queue, spmm_descr_t spmm_descr, - const std::vector &dependencies) { +sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, + const std::vector& dependencies) { return detail::submit_release(queue, spmm_descr, dependencies); } -void check_valid_spmm(const std::string &function_name, oneapi::mkl::transpose opA, +void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose opA, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { @@ -69,11 +69,11 @@ void check_valid_spmm(const std::string &function_name, oneapi::mkl::transpose o #endif // BACKEND } -void spmm_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose /*opB*/, const void *alpha, 
matrix_view A_view, - matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void *beta, +void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose /*opB*/, const void* alpha, matrix_view A_view, + matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg /*alg*/, spmm_descr_t spmm_descr, - std::size_t &temp_buffer_size) { + std::size_t& temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -83,10 +83,10 @@ void spmm_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, spmm_descr->buffer_size_called = true; } -inline void common_spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, +inline void common_spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -106,9 +106,9 @@ inline void common_spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, spmm_descr->last_optimized_alg = alg; } -void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, dense_matrix_handle_t C_handle, +void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, 
oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer /*workspace*/) { auto internal_A_handle = detail::get_internal_handle(A_handle); @@ -124,12 +124,12 @@ void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl:: // TODO: Add support for spmm_optimize once the close-source oneMKL backend supports it. } -sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, +sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, - spmm_descr_t spmm_descr, void * /*workspace*/, - const std::vector &dependencies) { + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr, void* /*workspace*/, + const std::vector& dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); @@ -145,16 +145,16 @@ sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, } template -sycl::event internal_spmm(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view /*A_view*/, +sycl::event internal_spmm(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view /*A_view*/, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg /*alg*/, - spmm_descr_t /*spmm_descr*/, const std::vector &dependencies, + const void* beta, dense_matrix_handle_t C_handle, spmm_alg /*alg*/, + spmm_descr_t 
/*spmm_descr*/, const std::vector& dependencies, bool is_alpha_host_accessible, bool is_beta_host_accessible) { T host_alpha = - detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); + detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); T host_beta = - detail::get_scalar_on_host(queue, static_cast(beta), is_beta_host_accessible); + detail::get_scalar_on_host(queue, static_cast(beta), is_beta_host_accessible); auto internal_A_handle = detail::get_internal_handle(A_handle); internal_A_handle->can_be_reset = false; auto layout = B_handle->dense_layout; @@ -176,11 +176,11 @@ sycl::event internal_spmm(sycl::queue &queue, oneapi::mkl::transpose opA, } } -sycl::event spmm(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, dense_matrix_handle_t C_handle, +sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - const std::vector &dependencies) { + const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmm(__func__, opA, A_view, A_handle, B_handle, C_handle, is_alpha_host_accessible, diff --git a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx index 1859257e4..9fc43d8e9 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spmv.cxx @@ -36,16 +36,16 @@ struct spmv_descr { namespace oneapi::mkl::sparse::BACKEND { -void init_spmv_descr(sycl::queue & /*queue*/, spmv_descr_t *p_spmv_descr) { +void 
init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) { *p_spmv_descr = new spmv_descr(); } -sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, - const std::vector &dependencies) { +sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, + const std::vector& dependencies) { return detail::submit_release(queue, spmv_descr, dependencies); } -void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose opA, +void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose opA, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { @@ -62,10 +62,10 @@ void check_valid_spmv(const std::string &function_name, oneapi::mkl::transpose o } } -void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg /*alg*/, - spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { + const void* beta, dense_vector_handle_t y_handle, spmv_alg /*alg*/, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. 
bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); @@ -75,9 +75,9 @@ void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void spmv_descr->buffer_size_called = true; } -inline void common_spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +inline void common_spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); @@ -97,9 +97,9 @@ inline void common_spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, spmv_descr->last_optimized_alg = alg; } -void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, sycl::buffer /*workspace*/) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (!internal_A_handle->all_use_buffer()) { @@ -125,11 +125,11 @@ void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a } } -sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t 
spmv_descr, - void * /*workspace*/, const std::vector &dependencies) { + void* /*workspace*/, const std::vector& dependencies) { auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); @@ -155,16 +155,16 @@ sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const } template -sycl::event internal_spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event internal_spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg /*alg*/, - spmv_descr_t /*spmv_descr*/, const std::vector &dependencies, + spmv_descr_t /*spmv_descr*/, const std::vector& dependencies, bool is_alpha_host_accessible, bool is_beta_host_accessible) { T host_alpha = - detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); + detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); T host_beta = - detail::get_scalar_on_host(queue, static_cast(beta), is_beta_host_accessible); + detail::get_scalar_on_host(queue, static_cast(beta), is_beta_host_accessible); auto internal_A_handle = detail::get_internal_handle(A_handle); internal_A_handle->can_be_reset = false; auto backend_handle = internal_A_handle->backend_handle; @@ -207,10 +207,10 @@ sycl::event internal_spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const } } -sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, const std::vector &dependencies) { + const void* beta, 
dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, diff --git a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx index 56a2491b2..dd2a4f627 100644 --- a/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx +++ b/src/sparse_blas/backends/mkl_common/mkl_spsv.cxx @@ -36,16 +36,16 @@ struct spsv_descr { namespace oneapi::mkl::sparse::BACKEND { -void init_spsv_descr(sycl::queue & /*queue*/, spsv_descr_t *p_spsv_descr) { +void init_spsv_descr(sycl::queue& /*queue*/, spsv_descr_t* p_spsv_descr) { *p_spsv_descr = new spsv_descr(); } -sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, - const std::vector &dependencies) { +sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, + const std::vector& dependencies) { return detail::submit_release(queue, spsv_descr, dependencies); } -void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose opA, +void check_valid_spsv(const std::string& function_name, oneapi::mkl::transpose opA, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible, spsv_alg alg) { auto internal_A_handle = detail::get_internal_handle(A_handle); @@ -72,10 +72,10 @@ void check_valid_spsv(const std::string &function_name, oneapi::mkl::transpose o #endif // BACKEND } -void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg 
alg, spsv_descr_t spsv_descr, - std::size_t &temp_buffer_size) { + std::size_t& temp_buffer_size) { // TODO: Add support for external workspace once the close-source oneMKL backend supports it. bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, @@ -84,7 +84,7 @@ void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void spsv_descr->buffer_size_called = true; } -inline void common_spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +inline void common_spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr) { @@ -104,7 +104,7 @@ inline void common_spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, spsv_descr->last_optimized_alg = alg; } -void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer /*workspace*/) { @@ -121,11 +121,11 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a internal_A_handle->backend_handle); } -sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, - spsv_alg alg, spsv_descr_t spsv_descr, void * /*workspace*/, - const std::vector &dependencies) { + spsv_alg alg, spsv_descr_t spsv_descr, void* /*workspace*/, + const std::vector& dependencies) { 
auto internal_A_handle = detail::get_internal_handle(A_handle); if (internal_A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); @@ -140,14 +140,14 @@ sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const } template -sycl::event internal_spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event internal_spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg /*alg*/, spsv_descr_t /*spsv_descr*/, - const std::vector &dependencies, + const std::vector& dependencies, bool is_alpha_host_accessible) { T host_alpha = - detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); + detail::get_scalar_on_host(queue, static_cast(alpha), is_alpha_host_accessible); auto internal_A_handle = detail::get_internal_handle(A_handle); internal_A_handle->can_be_reset = false; if (internal_A_handle->all_use_buffer()) { @@ -165,10 +165,10 @@ sycl::event internal_spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const } } -sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - const std::vector &dependencies) { + const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); check_valid_spsv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, alg); diff --git a/src/sparse_blas/common_op_verification.hpp b/src/sparse_blas/common_op_verification.hpp index be31ad43f..318766fb4 100644 --- a/src/sparse_blas/common_op_verification.hpp +++ b/src/sparse_blas/common_op_verification.hpp @@ -34,8 +34,8 @@ namespace 
oneapi::mkl::sparse::detail { /// Throw an exception if the scalar is not accessible in the host -inline void check_ptr_is_host_accessible(const std::string &function_name, - const std::string &scalar_name, +inline void check_ptr_is_host_accessible(const std::string& function_name, + const std::string& scalar_name, bool is_ptr_accessible_on_host) { if (!is_ptr_accessible_on_host) { throw mkl::invalid_argument( @@ -45,7 +45,7 @@ inline void check_ptr_is_host_accessible(const std::string &function_name, } template -void check_valid_spmm_common(const std::string &function_name, matrix_view A_view, +void check_valid_spmm_common(const std::string& function_name, matrix_view A_view, InternalSparseMatHandleT internal_A_handle, dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { @@ -80,7 +80,7 @@ void check_valid_spmm_common(const std::string &function_name, matrix_view A_vie } template -void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::transpose /*opA*/, +void check_valid_spmv_common(const std::string& function_name, oneapi::mkl::transpose /*opA*/, matrix_view A_view, InternalSparseMatHandleT internal_A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { @@ -112,7 +112,7 @@ void check_valid_spmv_common(const std::string &function_name, oneapi::mkl::tran } template -void check_valid_spsv_common(const std::string &function_name, matrix_view A_view, +void check_valid_spsv_common(const std::string& function_name, matrix_view A_view, InternalSparseMatHandleT internal_A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { diff --git a/src/sparse_blas/function_table.hpp b/src/sparse_blas/function_table.hpp index d1e3d8189..429468ca1 100644 --- a/src/sparse_blas/function_table.hpp +++ b/src/sparse_blas/function_table.hpp @@ -30,13 +30,13 @@ 
std::int64_t size, sycl::buffer val); \ void (*init_dense_vector_usm##FP_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ - std::int64_t size, FP_TYPE * val); \ + std::int64_t size, FP_TYPE* val); \ void (*set_dense_vector_data_buffer##FP_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ std::int64_t size, sycl::buffer val); \ void (*set_dense_vector_data_usm##FP_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ - std::int64_t size, FP_TYPE * val) + std::int64_t size, FP_TYPE* val) // Dense matrix #define DEFINE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX) \ @@ -47,7 +47,7 @@ void (*init_dense_matrix_usm##FP_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE * val); \ + oneapi::mkl::layout dense_layout, FP_TYPE* val); \ void (*set_dense_matrix_data_buffer##FP_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ @@ -55,7 +55,7 @@ void (*set_dense_matrix_data_usm##FP_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE * val) + oneapi::mkl::layout dense_layout, FP_TYPE* val) // COO matrix #define DEFINE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ @@ -67,7 +67,7 @@ void (*init_coo_matrix_usm##FP_SUFFIX##INT_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val); \ + oneapi::mkl::index_base index, INT_TYPE* row_ind, INT_TYPE* col_ind, FP_TYPE* val); \ void 
(*set_coo_matrix_data_buffer##FP_SUFFIX##INT_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ @@ -75,8 +75,8 @@ sycl::buffer val); \ void (*set_coo_matrix_data_usm##FP_SUFFIX##INT_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val) + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ind, \ + INT_TYPE* col_ind, FP_TYPE* val) // CSR matrix #define DEFINE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ @@ -88,7 +88,7 @@ void (*init_csr_matrix_usm##FP_SUFFIX##INT_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val); \ + oneapi::mkl::index_base index, INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val); \ void (*set_csr_matrix_data_buffer##FP_SUFFIX##INT_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ @@ -96,23 +96,23 @@ sycl::buffer val); \ void (*set_csr_matrix_data_usm##FP_SUFFIX##INT_SUFFIX)( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val) + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ptr, \ + INT_TYPE* col_ind, FP_TYPE* val) typedef struct { int version; // Dense vector FOR_EACH_FP_TYPE(DEFINE_DENSE_VECTOR_FUNCS); - sycl::event (*release_dense_vector)(sycl::queue &queue, + 
sycl::event (*release_dense_vector)(sycl::queue& queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, - const std::vector &dependencies); + const std::vector& dependencies); // Dense matrix FOR_EACH_FP_TYPE(DEFINE_DENSE_MATRIX_FUNCS); - sycl::event (*release_dense_matrix)(sycl::queue &queue, + sycl::event (*release_dense_matrix)(sycl::queue& queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, - const std::vector &dependencies); + const std::vector& dependencies); // COO matrix FOR_EACH_FP_AND_INT_TYPE(DEFINE_COO_MATRIX_FUNCS); @@ -121,117 +121,117 @@ typedef struct { FOR_EACH_FP_AND_INT_TYPE(DEFINE_CSR_MATRIX_FUNCS); // Common sparse matrix functions - sycl::event (*release_sparse_matrix)(sycl::queue &queue, + sycl::event (*release_sparse_matrix)(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, - const std::vector &dependencies); + const std::vector& dependencies); - bool (*set_matrix_property)(sycl::queue &queue, oneapi::mkl::sparse::matrix_handle_t smhandle, + bool (*set_matrix_property)(sycl::queue& queue, oneapi::mkl::sparse::matrix_handle_t smhandle, oneapi::mkl::sparse::matrix_property property); // SPMM - void (*init_spmm_descr)(sycl::queue &queue, oneapi::mkl::sparse::spmm_descr_t *p_spmm_descr); + void (*init_spmm_descr)(sycl::queue& queue, oneapi::mkl::sparse::spmm_descr_t* p_spmm_descr); - sycl::event (*release_spmm_descr)(sycl::queue &queue, + sycl::event (*release_spmm_descr)(sycl::queue& queue, oneapi::mkl::sparse::spmm_descr_t spmm_descr, - const std::vector &dependencies); + const std::vector& dependencies); - void (*spmm_buffer_size)(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, + void (*spmm_buffer_size)(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, + 
oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, - std::size_t &temp_buffer_size); + std::size_t& temp_buffer_size); void (*spmm_optimize_buffer)( - sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, oneapi::mkl::sparse::matrix_view A_view, + sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, sycl::buffer workspace); - sycl::event (*spmm_optimize_usm)(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, + sycl::event (*spmm_optimize_usm)(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, oneapi::mkl::sparse::dense_matrix_handle_t B_handle, - const void *beta, + const void* beta, oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, - oneapi::mkl::sparse::spmm_descr_t spmm_descr, void *workspace, - const std::vector &dependencies); + oneapi::mkl::sparse::spmm_descr_t spmm_descr, void* workspace, + const std::vector& dependencies); - sycl::event (*spmm)(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, oneapi::mkl::sparse::matrix_view A_view, + sycl::event (*spmm)(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, oneapi::mkl::sparse::matrix_view A_view, 
oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void *beta, + oneapi::mkl::sparse::dense_matrix_handle_t B_handle, const void* beta, oneapi::mkl::sparse::dense_matrix_handle_t C_handle, oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::spmm_descr_t spmm_descr, - const std::vector &dependencies); + const std::vector& dependencies); // SPMV - void (*init_spmv_descr)(sycl::queue &queue, oneapi::mkl::sparse::spmv_descr_t *p_spmv_descr); + void (*init_spmv_descr)(sycl::queue& queue, oneapi::mkl::sparse::spmv_descr_t* p_spmv_descr); - sycl::event (*release_spmv_descr)(sycl::queue &queue, + sycl::event (*release_spmv_descr)(sycl::queue& queue, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - const std::vector &dependencies); + const std::vector& dependencies); - void (*spmv_buffer_size)(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + void (*spmv_buffer_size)(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, oneapi::mkl::sparse::dense_vector_handle_t y_handle, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - std::size_t &temp_buffer_size); + std::size_t& temp_buffer_size); void (*spmv_optimize_buffer)( - sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, oneapi::mkl::sparse::dense_vector_handle_t y_handle, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, 
sycl::buffer workspace); - sycl::event (*spmv_optimize_usm)(sycl::queue &queue, oneapi::mkl::transpose opA, - const void *alpha, oneapi::mkl::sparse::matrix_view A_view, + sycl::event (*spmv_optimize_usm)(sycl::queue& queue, oneapi::mkl::transpose opA, + const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, oneapi::mkl::sparse::dense_vector_handle_t x_handle, - const void *beta, + const void* beta, oneapi::mkl::sparse::dense_vector_handle_t y_handle, oneapi::mkl::sparse::spmv_alg alg, - oneapi::mkl::sparse::spmv_descr_t spmv_descr, void *workspace, - const std::vector &dependencies); + oneapi::mkl::sparse::spmv_descr_t spmv_descr, void* workspace, + const std::vector& dependencies); - sycl::event (*spmv)(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + sycl::event (*spmv)(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, - oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void *beta, + oneapi::mkl::sparse::dense_vector_handle_t x_handle, const void* beta, oneapi::mkl::sparse::dense_vector_handle_t y_handle, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::spmv_descr_t spmv_descr, - const std::vector &dependencies); + const std::vector& dependencies); // SPSV - void (*init_spsv_descr)(sycl::queue &queue, oneapi::mkl::sparse::spsv_descr_t *p_spsv_descr); + void (*init_spsv_descr)(sycl::queue& queue, oneapi::mkl::sparse::spsv_descr_t* p_spsv_descr); - sycl::event (*release_spsv_descr)(sycl::queue &queue, + sycl::event (*release_spsv_descr)(sycl::queue& queue, oneapi::mkl::sparse::spsv_descr_t spsv_descr, - const std::vector &dependencies); + const std::vector& dependencies); - void (*spsv_buffer_size)(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + void (*spsv_buffer_size)(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, 
oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, oneapi::mkl::sparse::dense_vector_handle_t x_handle, oneapi::mkl::sparse::dense_vector_handle_t y_handle, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, - std::size_t &temp_buffer_size); + std::size_t& temp_buffer_size); - void (*spsv_optimize_buffer)(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + void (*spsv_optimize_buffer)(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, oneapi::mkl::sparse::dense_vector_handle_t x_handle, @@ -240,23 +240,23 @@ typedef struct { oneapi::mkl::sparse::spsv_descr_t spsv_descr, sycl::buffer workspace); - sycl::event (*spsv_optimize_usm)(sycl::queue &queue, oneapi::mkl::transpose opA, - const void *alpha, oneapi::mkl::sparse::matrix_view A_view, + sycl::event (*spsv_optimize_usm)(sycl::queue& queue, oneapi::mkl::transpose opA, + const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, oneapi::mkl::sparse::dense_vector_handle_t x_handle, oneapi::mkl::sparse::dense_vector_handle_t y_handle, oneapi::mkl::sparse::spsv_alg alg, - oneapi::mkl::sparse::spsv_descr_t spsv_descr, void *workspace, - const std::vector &dependencies); + oneapi::mkl::sparse::spsv_descr_t spsv_descr, void* workspace, + const std::vector& dependencies); - sycl::event (*spsv)(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, + sycl::event (*spsv)(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, oneapi::mkl::sparse::matrix_view A_view, oneapi::mkl::sparse::matrix_handle_t A_handle, oneapi::mkl::sparse::dense_vector_handle_t x_handle, oneapi::mkl::sparse::dense_vector_handle_t y_handle, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::spsv_descr_t spsv_descr, - const std::vector &dependencies); + const std::vector& dependencies); } 
sparse_blas_function_table_t; #undef DEFINE_DENSE_VECTOR_FUNCS diff --git a/src/sparse_blas/macros.hpp b/src/sparse_blas/macros.hpp index 9eb769736..72aa39a75 100644 --- a/src/sparse_blas/macros.hpp +++ b/src/sparse_blas/macros.hpp @@ -42,13 +42,13 @@ std::int64_t size, sycl::buffer val); \ template void init_dense_vector( \ sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t * p_dvhandle, \ - std::int64_t size, FP_TYPE * val); \ + std::int64_t size, FP_TYPE* val); \ template void set_dense_vector_data( \ sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ std::int64_t size, sycl::buffer val); \ template void set_dense_vector_data( \ sycl::queue & queue, oneapi::mkl::sparse::dense_vector_handle_t dvhandle, \ - std::int64_t size, FP_TYPE * val) + std::int64_t size, FP_TYPE* val) #define INSTANTIATE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX) \ template void init_dense_matrix( \ @@ -58,7 +58,7 @@ template void init_dense_matrix( \ sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t * p_dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE * val); \ + oneapi::mkl::layout dense_layout, FP_TYPE* val); \ template void set_dense_matrix_data( \ sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ @@ -66,7 +66,7 @@ template void set_dense_matrix_data( \ sycl::queue & queue, oneapi::mkl::sparse::dense_matrix_handle_t dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - oneapi::mkl::layout dense_layout, FP_TYPE * val) + oneapi::mkl::layout dense_layout, FP_TYPE* val) #define INSTANTIATE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ template void init_coo_matrix( \ @@ -77,7 +77,7 @@ template void init_coo_matrix( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ std::int64_t num_rows, std::int64_t num_cols, 
std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val); \ + oneapi::mkl::index_base index, INT_TYPE* row_ind, INT_TYPE* col_ind, FP_TYPE* val); \ template void set_coo_matrix_data( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ @@ -85,8 +85,8 @@ sycl::buffer val); \ template void set_coo_matrix_data( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - INT_TYPE * row_ind, INT_TYPE * col_ind, FP_TYPE * val) + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ind, \ + INT_TYPE* col_ind, FP_TYPE* val) #define INSTANTIATE_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ template void init_csr_matrix( \ @@ -97,7 +97,7 @@ template void init_csr_matrix( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t * p_smhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, \ - oneapi::mkl::index_base index, INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val); \ + oneapi::mkl::index_base index, INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val); \ template void set_csr_matrix_data( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ @@ -105,8 +105,8 @@ sycl::buffer val); \ template void set_csr_matrix_data( \ sycl::queue & queue, oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, \ - std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, \ - INT_TYPE * row_ptr, INT_TYPE * col_ind, FP_TYPE * val) + std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, INT_TYPE* row_ptr, \ + INT_TYPE* col_ind, FP_TYPE* val) #define THROW_IF_NULLPTR(FUNC_NAME, PTR) \ 
if (!(PTR)) { \ diff --git a/src/sparse_blas/sparse_blas_loader.cpp b/src/sparse_blas/sparse_blas_loader.cpp index cdc3ae6b2..82ce9c6a9 100644 --- a/src/sparse_blas/sparse_blas_loader.cpp +++ b/src/sparse_blas/sparse_blas_loader.cpp @@ -33,35 +33,35 @@ static oneapi::mkl::detail::table_initializer \ - void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, \ + void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, \ std::int64_t size, sycl::buffer val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].init_dense_vector_buffer##FP_SUFFIX(queue, p_dvhandle, size, val); \ } \ template <> \ - void init_dense_vector(sycl::queue &queue, dense_vector_handle_t *p_dvhandle, \ - std::int64_t size, FP_TYPE *val) { \ + void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, \ + std::int64_t size, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].init_dense_vector_usm##FP_SUFFIX(queue, p_dvhandle, size, val); \ } \ template <> \ - void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, \ + void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, \ std::int64_t size, sycl::buffer val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].set_dense_vector_data_buffer##FP_SUFFIX(queue, dvhandle, size, \ val); \ } \ template <> \ - void set_dense_vector_data(sycl::queue &queue, dense_vector_handle_t dvhandle, \ - std::int64_t size, FP_TYPE *val) { \ + void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, \ + std::int64_t size, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].set_dense_vector_data_usm##FP_SUFFIX(queue, dvhandle, size, val); \ } FOR_EACH_FP_TYPE(DEFINE_DENSE_VECTOR_FUNCS); #undef DEFINE_DENSE_VECTOR_FUNCS -sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhandle, - const std::vector &dependencies) { +sycl::event 
release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhandle, + const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].release_dense_vector(queue, dvhandle, dependencies); } @@ -69,7 +69,7 @@ sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhan // Dense matrix #define DEFINE_DENSE_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX) \ template <> \ - void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, \ + void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ layout dense_layout, sycl::buffer val) { \ auto libkey = get_device_id(queue); \ @@ -77,15 +77,15 @@ sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhan queue, p_dmhandle, num_rows, num_cols, ld, dense_layout, val); \ } \ template <> \ - void init_dense_matrix(sycl::queue &queue, dense_matrix_handle_t *p_dmhandle, \ + void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - layout dense_layout, FP_TYPE *val) { \ + layout dense_layout, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].init_dense_matrix_usm##FP_SUFFIX(queue, p_dmhandle, num_rows, \ num_cols, ld, dense_layout, val); \ } \ template <> \ - void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, \ + void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ layout dense_layout, sycl::buffer val) { \ auto libkey = get_device_id(queue); \ @@ -93,9 +93,9 @@ sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhan queue, dmhandle, num_rows, num_cols, ld, dense_layout, val); \ } \ template <> \ - void set_dense_matrix_data(sycl::queue &queue, dense_matrix_handle_t dmhandle, \ + void 
set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, \ std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, \ - layout dense_layout, FP_TYPE *val) { \ + layout dense_layout, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].set_dense_matrix_data_usm##FP_SUFFIX( \ queue, dmhandle, num_rows, num_cols, ld, dense_layout, val); \ @@ -103,8 +103,8 @@ sycl::event release_dense_vector(sycl::queue &queue, dense_vector_handle_t dvhan FOR_EACH_FP_TYPE(DEFINE_DENSE_MATRIX_FUNCS); #undef DEFINE_DENSE_MATRIX_FUNCS -sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhandle, - const std::vector &dependencies) { +sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhandle, + const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].release_dense_matrix(queue, dmhandle, dependencies); } @@ -112,7 +112,7 @@ sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhan // COO matrix #define DEFINE_COO_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ template <> \ - void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, \ + void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ sycl::buffer row_ind, sycl::buffer col_ind, \ sycl::buffer val) { \ @@ -121,15 +121,15 @@ sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhan queue, p_smhandle, num_rows, num_cols, nnz, index, row_ind, col_ind, val); \ } \ template <> \ - void init_coo_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, \ + void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ - INT_TYPE *row_ind, INT_TYPE *col_ind, FP_TYPE *val) { \ + INT_TYPE* row_ind, 
INT_TYPE* col_ind, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].init_coo_matrix_usm##FP_SUFFIX##INT_SUFFIX( \ queue, p_smhandle, num_rows, num_cols, nnz, index, row_ind, col_ind, val); \ } \ template <> \ - void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, \ + void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ sycl::buffer row_ind, sycl::buffer col_ind, \ sycl::buffer val) { \ @@ -138,9 +138,9 @@ sycl::event release_dense_matrix(sycl::queue &queue, dense_matrix_handle_t dmhan queue, smhandle, num_rows, num_cols, nnz, index, row_ind, col_ind, val); \ } \ template <> \ - void set_coo_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, \ + void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ - INT_TYPE *row_ind, INT_TYPE *col_ind, FP_TYPE *val) { \ + INT_TYPE* row_ind, INT_TYPE* col_ind, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].set_coo_matrix_data_usm##FP_SUFFIX##INT_SUFFIX( \ queue, smhandle, num_rows, num_cols, nnz, index, row_ind, col_ind, val); \ @@ -151,7 +151,7 @@ FOR_EACH_FP_AND_INT_TYPE(DEFINE_COO_MATRIX_FUNCS); // CSR matrix #define DEFINE_INIT_CSR_MATRIX_FUNCS(FP_TYPE, FP_SUFFIX, INT_TYPE, INT_SUFFIX) \ template <> \ - void init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, \ + void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ sycl::buffer row_ptr, sycl::buffer col_ind, \ sycl::buffer val) { \ @@ -160,15 +160,15 @@ FOR_EACH_FP_AND_INT_TYPE(DEFINE_COO_MATRIX_FUNCS); queue, p_smhandle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val); \ } \ template <> \ - void 
init_csr_matrix(sycl::queue &queue, matrix_handle_t *p_smhandle, std::int64_t num_rows, \ + void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ - INT_TYPE *row_ptr, INT_TYPE *col_ind, FP_TYPE *val) { \ + INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].init_csr_matrix_usm##FP_SUFFIX##INT_SUFFIX( \ queue, p_smhandle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val); \ } \ template <> \ - void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, \ + void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ sycl::buffer row_ptr, sycl::buffer col_ind, \ sycl::buffer val) { \ @@ -177,9 +177,9 @@ FOR_EACH_FP_AND_INT_TYPE(DEFINE_COO_MATRIX_FUNCS); queue, smhandle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val); \ } \ template <> \ - void set_csr_matrix_data(sycl::queue &queue, matrix_handle_t smhandle, std::int64_t num_rows, \ + void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int64_t num_rows, \ std::int64_t num_cols, std::int64_t nnz, index_base index, \ - INT_TYPE *row_ptr, INT_TYPE *col_ind, FP_TYPE *val) { \ + INT_TYPE* row_ptr, INT_TYPE* col_ind, FP_TYPE* val) { \ auto libkey = get_device_id(queue); \ function_tables[libkey].set_csr_matrix_data_usm##FP_SUFFIX##INT_SUFFIX( \ queue, smhandle, num_rows, num_cols, nnz, index, row_ptr, col_ind, val); \ @@ -188,142 +188,142 @@ FOR_EACH_FP_AND_INT_TYPE(DEFINE_INIT_CSR_MATRIX_FUNCS); #undef DEFINE_INIT_CSR_MATRIX_FUNCS // Common sparse matrix functions -sycl::event release_sparse_matrix(sycl::queue &queue, matrix_handle_t smhandle, - const std::vector &dependencies) { +sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle, + const std::vector& dependencies) 
{ auto libkey = get_device_id(queue); return function_tables[libkey].release_sparse_matrix(queue, smhandle, dependencies); } -bool set_matrix_property(sycl::queue &queue, matrix_handle_t smhandle, matrix_property property) { +bool set_matrix_property(sycl::queue& queue, matrix_handle_t smhandle, matrix_property property) { auto libkey = get_device_id(queue); return function_tables[libkey].set_matrix_property(queue, smhandle, property); } // SPMM -void init_spmm_descr(sycl::queue &queue, spmm_descr_t *p_spmm_descr) { +void init_spmm_descr(sycl::queue& queue, spmm_descr_t* p_spmm_descr) { auto libkey = get_device_id(queue); function_tables[libkey].init_spmm_descr(queue, p_spmm_descr); } -sycl::event release_spmm_descr(sycl::queue &queue, spmm_descr_t spmm_descr, - const std::vector &dependencies) { +sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, + const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].release_spmm_descr(queue, spmm_descr, dependencies); } -void spmm_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, +void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - std::size_t &temp_buffer_size) { + std::size_t& temp_buffer_size) { auto libkey = get_device_id(queue); function_tables[libkey].spmm_buffer_size(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, temp_buffer_size); } -void spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t 
B_handle, const void *beta, dense_matrix_handle_t C_handle, +void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, sycl::buffer workspace) { auto libkey = get_device_id(queue); function_tables[libkey].spmm_optimize_buffer(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, workspace); } -sycl::event spmm_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, - oneapi::mkl::transpose opB, const void *alpha, matrix_view A_view, +sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, + oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, - const void *beta, dense_matrix_handle_t C_handle, spmm_alg alg, - spmm_descr_t spmm_descr, void *workspace, - const std::vector &dependencies) { + const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, + spmm_descr_t spmm_descr, void* workspace, + const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].spmm_optimize_usm(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, workspace, dependencies); } -sycl::event spmm(sycl::queue &queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void *alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void *beta, dense_matrix_handle_t C_handle, +sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - const std::vector &dependencies) { + const std::vector& dependencies) { auto 
libkey = get_device_id(queue); return function_tables[libkey].spmm(queue, opA, opB, alpha, A_view, A_handle, B_handle, beta, C_handle, alg, spmm_descr, dependencies); } // SPMV -void init_spmv_descr(sycl::queue &queue, spmv_descr_t *p_spmv_descr) { +void init_spmv_descr(sycl::queue& queue, spmv_descr_t* p_spmv_descr) { auto libkey = get_device_id(queue); function_tables[libkey].init_spmv_descr(queue, p_spmv_descr); } -sycl::event release_spmv_descr(sycl::queue &queue, spmv_descr_t spmv_descr, - const std::vector &dependencies) { +sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, + const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].release_spmv_descr(queue, spmv_descr, dependencies); } -void spmv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, std::size_t &temp_buffer_size) { + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { auto libkey = get_device_id(queue); function_tables[libkey].spmv_buffer_size(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr, temp_buffer_size); } -void spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, sycl::buffer workspace) { auto libkey = get_device_id(queue); function_tables[libkey].spmv_optimize_buffer(queue, opA, alpha, 
A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr, workspace); } -sycl::event spmv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, const void *beta, + dense_vector_handle_t x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, spmv_descr_t spmv_descr, - void *workspace, const std::vector &dependencies) { + void* workspace, const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].spmv_optimize_usm(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr, workspace, dependencies); } -sycl::event spmv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void *beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, const std::vector &dependencies) { + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].spmv(queue, opA, alpha, A_view, A_handle, x_handle, beta, y_handle, alg, spmv_descr, dependencies); } // SPSV -void init_spsv_descr(sycl::queue &queue, spsv_descr_t *p_spsv_descr) { +void init_spsv_descr(sycl::queue& queue, spsv_descr_t* p_spsv_descr) { auto libkey = get_device_id(queue); function_tables[libkey].init_spsv_descr(queue, p_spsv_descr); } -sycl::event release_spsv_descr(sycl::queue &queue, spsv_descr_t spsv_descr, - const std::vector &dependencies) { +sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, + const std::vector& dependencies) { auto libkey = get_device_id(queue); return 
function_tables[libkey].release_spsv_descr(queue, spsv_descr, dependencies); } -void spsv_buffer_size(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - std::size_t &temp_buffer_size) { + std::size_t& temp_buffer_size) { auto libkey = get_device_id(queue); function_tables[libkey].spsv_buffer_size(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, temp_buffer_size); } -void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, sycl::buffer workspace) { @@ -332,21 +332,21 @@ void spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *a y_handle, alg, spsv_descr, workspace); } -sycl::event spsv_optimize(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, - spsv_alg alg, spsv_descr_t spsv_descr, void *workspace, - const std::vector &dependencies) { + spsv_alg alg, spsv_descr_t spsv_descr, void* workspace, + const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].spsv_optimize_usm(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, workspace, dependencies); } -sycl::event spsv(sycl::queue &queue, oneapi::mkl::transpose opA, const void *alpha, +sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view 
A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, - const std::vector &dependencies) { + const std::vector& dependencies) { auto libkey = get_device_id(queue); return function_tables[libkey].spsv(queue, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, spsv_descr, dependencies); diff --git a/src/sparse_blas/sycl_helper.hpp b/src/sparse_blas/sycl_helper.hpp index 67580159c..1a055b405 100644 --- a/src/sparse_blas/sycl_helper.hpp +++ b/src/sparse_blas/sycl_helper.hpp @@ -30,7 +30,7 @@ namespace oneapi::mkl::sparse::detail { /// Return whether a pointer is accessible on the host template -inline bool is_ptr_accessible_on_host(sycl::queue queue, const T *host_or_device_ptr) { +inline bool is_ptr_accessible_on_host(sycl::queue queue, const T* host_or_device_ptr) { auto alloc_type = sycl::get_pointer_type(host_or_device_ptr, queue.get_context()); return alloc_type == sycl::usm::alloc::host || alloc_type == sycl::usm::alloc::shared || alloc_type == sycl::usm::alloc::unknown; @@ -38,7 +38,7 @@ inline bool is_ptr_accessible_on_host(sycl::queue queue, const T *host_or_device /// Return a scalar on the host from a pointer to host or device memory template -inline T get_scalar_on_host(sycl::queue &queue, const T *host_or_device_ptr, +inline T get_scalar_on_host(sycl::queue& queue, const T* host_or_device_ptr, bool is_ptr_accessible_on_host) { if (is_ptr_accessible_on_host) { return *host_or_device_ptr; @@ -51,17 +51,17 @@ inline T get_scalar_on_host(sycl::queue &queue, const T *host_or_device_ptr, /// Submit the release of \p ptr in a host_task waiting on the dependencies template -sycl::event submit_release(sycl::queue &queue, T *ptr, - const std::vector &dependencies) { - return queue.submit([&](sycl::handler &cgh) { +sycl::event submit_release(sycl::queue& queue, T* ptr, + const std::vector& dependencies) { + return queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); 
cgh.host_task([=]() { delete ptr; }); }); } /// Merge multiple event dependencies into one -inline sycl::event collapse_dependencies(sycl::queue &queue, - const std::vector &dependencies) { +inline sycl::event collapse_dependencies(sycl::queue& queue, + const std::vector& dependencies) { if (dependencies.empty()) { return {}; } @@ -69,7 +69,7 @@ inline sycl::event collapse_dependencies(sycl::queue &queue, return dependencies[0]; } - return queue.submit([&](sycl::handler &cgh) { + return queue.submit([&](sycl::handler& cgh) { cgh.depends_on(dependencies); cgh.host_task([=]() {}); }); diff --git a/tests/unit_tests/blas/batch/axpy_batch_stride.cpp b/tests/unit_tests/blas/batch/axpy_batch_stride.cpp index 9bb1406ef..e311237a1 100644 --- a/tests/unit_tests/blas/batch/axpy_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/axpy_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp alpha, int64_t batch_size) { // Prepare data. int64_t n, i; @@ -77,19 +77,19 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp int batch_size_ref = (int)batch_size; for (i = 0; i < batch_size_ref; i++) { - ::axpy(&n_ref, (fp_ref *)&alpha, (fp_ref *)x.data() + i * stride_x, &incx_ref, - (fp_ref *)y_ref.data() + i * stride_y, &incy_ref); + ::axpy(&n_ref, (fp_ref*)&alpha, (fp_ref*)x.data() + i * stride_x, &incx_ref, + (fp_ref*)y_ref.data() + i * stride_y, &incy_ref); } // Call DPC++ AXPY_BATCH_STRIDE. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -133,17 +133,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during AXPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of AXPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -160,7 +160,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp } class AxpyBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(AxpyBatchStrideTests, RealSinglePrecision) { float alpha = 2.0; diff --git a/tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp index 9ebc82abe..36f260e10 100644 --- a/tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/axpy_batch_stride_usm.cpp @@ -43,20 +43,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp alpha, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -101,8 +101,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp int batch_size_ref = (int)batch_size; for (i = 0; i < batch_size_ref; i++) { - ::axpy(&n_ref, (fp_ref *)&alpha, (fp_ref *)x.data() + i * stride_x, &incx_ref, - (fp_ref *)y_ref.data() + i * stride_y, &incy_ref); + ::axpy(&n_ref, (fp_ref*)&alpha, (fp_ref*)x.data() + i * stride_x, &incx_ref, + (fp_ref*)y_ref.data() + i * stride_y, &incy_ref); } // Call DPC++ AXPY_BATCH_STRIDE. @@ -140,17 +140,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during AXPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of AXPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -166,7 +166,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, fp } class AxpyBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(AxpyBatchStrideUsmTests, RealSinglePrecision) { float alpha = 2.0; diff --git a/tests/unit_tests/blas/batch/axpy_batch_usm.cpp b/tests/unit_tests/blas/batch/axpy_batch_usm.cpp index 4dacf8ddb..a65367eb0 100644 --- a/tests/unit_tests/blas/batch/axpy_batch_usm.cpp +++ 
b/tests/unit_tests/blas/batch/axpy_batch_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during AXPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -69,15 +69,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { std::vector dependencies; // Prepare data. - int64_t *n = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); - int64_t *incx = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); - int64_t *incy = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); - fp *alpha = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * group_count, *dev, cxt); - int64_t *group_size = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + int64_t* n = (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + int64_t* incx = + (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + int64_t* incy = + (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + fp* alpha = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * group_count, *dev, cxt); + int64_t* group_size = + (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); if ((n == NULL) || (incx == NULL) || (incy == NULL) || (alpha == NULL) || (group_size == NULL)) { @@ 
-104,12 +103,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - fp **x_array = - (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt); - fp **y_array = - (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt); - fp **y_ref_array = - (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt); + fp** x_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * total_batch_count, *dev, cxt); + fp** y_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * total_batch_count, *dev, cxt); + fp** y_ref_array = + (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * total_batch_count, *dev, cxt); if ((x_array == NULL) || (y_array == NULL) || (y_ref_array == NULL)) { std::cout << "Error cannot allocate arrays of pointers\n"; @@ -124,11 +121,11 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_size_x = (1 + (n[i] - 1) * std::abs(incx[i])); total_size_y = (1 + (n[i] - 1) * std::abs(incy[i])); x_array[idx] = - (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_x, *dev, cxt); + (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_x, *dev, cxt); y_array[idx] = - (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); + (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); y_ref_array[idx] = - (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); + (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); rand_vector(x_array[idx], n[i], incx[i]); rand_vector(y_array[idx], n[i], incy[i]); copy_vector(y_array[idx], n[i], incy[i], y_ref_array[idx]); @@ -146,8 +143,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { n_ref = (int)n[i]; incx_ref = (int)incx[i]; incy_ref = (int)incy[i]; - ::axpy((const int *)&n_ref, (const fp_ref *)&alpha[i], (const fp_ref *)x_array[idx], - (const int 
*)&incx_ref, (fp_ref *)y_ref_array[idx], (const int *)&incy_ref); + ::axpy((const int*)&n_ref, (const fp_ref*)&alpha[i], (const fp_ref*)x_array[idx], + (const int*)&incx_ref, (fp_ref*)y_ref_array[idx], (const int*)&incy_ref); idx++; } } @@ -159,12 +156,12 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { switch (layout) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::axpy_batch( - main_queue, n, alpha, (const fp **)x_array, incx, y_array, incy, group_count, + main_queue, n, alpha, (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; case oneapi::mkl::layout::row_major: done = oneapi::mkl::blas::row_major::axpy_batch( - main_queue, n, alpha, (const fp **)x_array, incx, y_array, incy, group_count, + main_queue, n, alpha, (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; default: break; @@ -174,12 +171,12 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { switch (layout) { case oneapi::mkl::layout::col_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::axpy_batch, n, - alpha, (const fp **)x_array, incx, y_array, incy, + alpha, (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; case oneapi::mkl::layout::row_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::axpy_batch, n, - alpha, (const fp **)x_array, incx, y_array, incy, + alpha, (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; default: break; @@ -187,13 +184,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during AXPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { idx 
= 0; for (i = 0; i < group_count; i++) { for (j = 0; j < group_size[i]; j++) { @@ -214,7 +211,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of AXPY_BATCH:\n" << error.what() << std::endl; } @@ -252,7 +249,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class AxpyBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(AxpyBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/copy_batch_stride.cpp b/tests/unit_tests/blas/batch/copy_batch_stride.cpp index a1da595f6..ff51e1c6d 100644 --- a/tests/unit_tests/blas/batch/copy_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/copy_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { // Prepare data. int64_t n, i; @@ -76,19 +76,19 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in int batch_size_ref = (int)batch_size; for (i = 0; i < batch_size_ref; i++) { - ::copy(&n_ref, (fp_ref *)x.data() + i * stride_x, &incx_ref, - (fp_ref *)y_ref.data() + i * stride_y, &incy_ref); + ::copy(&n_ref, (fp_ref*)x.data() + i * stride_x, &incx_ref, + (fp_ref*)y_ref.data() + i * stride_y, &incy_ref); } // Call DPC++ COPY_BATCH_STRIDE. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during COPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -130,17 +130,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during COPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of COPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -157,7 +157,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in } class CopyBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(CopyBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 15)); diff --git a/tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp index 569293be1..062054d55 100644 --- a/tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/copy_batch_stride_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during COPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -100,8 +100,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in int batch_size_ref = (int)batch_size; for (i = 0; i < batch_size_ref; i++) { - ::copy(&n_ref, (fp_ref *)x.data() + i * stride_x, &incx_ref, - (fp_ref *)y_ref.data() + i * stride_y, &incy_ref); + ::copy(&n_ref, (fp_ref*)x.data() + i * stride_x, &incx_ref, + (fp_ref*)y_ref.data() + i * stride_y, &incy_ref); } // Call DPC++ COPY_BATCH_STRIDE. @@ -139,17 +139,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during COPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of COPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -165,7 +165,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in } class CopyBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(CopyBatchStrideUsmTests, RealSinglePrecision) { float alpha = 2.0; diff --git a/tests/unit_tests/blas/batch/copy_batch_usm.cpp b/tests/unit_tests/blas/batch/copy_batch_usm.cpp index 8cac23704..ce051a046 100644 --- a/tests/unit_tests/blas/batch/copy_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/copy_batch_usm.cpp @@ -43,19 
+43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during COPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -69,14 +69,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { std::vector dependencies; // Prepare data. - int64_t *n = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); - int64_t *incx = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); - int64_t *incy = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); - int64_t *group_size = - (int64_t *)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + int64_t* n = (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + int64_t* incx = + (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + int64_t* incy = + (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); + int64_t* group_size = + (int64_t*)oneapi::mkl::malloc_shared(64, sizeof(int64_t) * group_count, *dev, cxt); if ((n == NULL) || (incx == NULL) || (incy == NULL) || (group_size == NULL)) { std::cout << "Error cannot allocate input arrays\n"; @@ -100,12 +99,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - fp **x_array = - (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * 
total_batch_count, *dev, cxt); - fp **y_array = - (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt); - fp **y_ref_array = - (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * total_batch_count, *dev, cxt); + fp** x_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * total_batch_count, *dev, cxt); + fp** y_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * total_batch_count, *dev, cxt); + fp** y_ref_array = + (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * total_batch_count, *dev, cxt); if ((x_array == NULL) || (y_array == NULL) || (y_ref_array == NULL)) { std::cout << "Error cannot allocate arrays of pointers\n"; @@ -120,11 +117,11 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_size_x = (1 + (n[i] - 1) * std::abs(incx[i])); total_size_y = (1 + (n[i] - 1) * std::abs(incy[i])); x_array[idx] = - (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_x, *dev, cxt); + (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_x, *dev, cxt); y_array[idx] = - (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); + (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); y_ref_array[idx] = - (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); + (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * total_size_y, *dev, cxt); rand_vector(x_array[idx], n[i], incx[i]); rand_vector(y_array[idx], n[i], incy[i]); copy_vector(y_array[idx], n[i], incy[i], y_ref_array[idx]); @@ -142,8 +139,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { n_ref = (int)n[i]; incx_ref = (int)incx[i]; incy_ref = (int)incy[i]; - ::copy((const int *)&n_ref, (const fp_ref *)x_array[idx], (const int *)&incx_ref, - (fp_ref *)y_ref_array[idx], (const int *)&incy_ref); + ::copy((const int*)&n_ref, (const fp_ref*)x_array[idx], (const int*)&incx_ref, + (fp_ref*)y_ref_array[idx], (const int*)&incy_ref); idx++; } } @@ -155,11 +152,11 
@@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { switch (layout) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::copy_batch( - main_queue, n, (const fp **)x_array, incx, y_array, incy, group_count, + main_queue, n, (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; case oneapi::mkl::layout::row_major: - done = oneapi::mkl::blas::row_major::copy_batch(main_queue, n, (const fp **)x_array, + done = oneapi::mkl::blas::row_major::copy_batch(main_queue, n, (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; @@ -170,12 +167,12 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { switch (layout) { case oneapi::mkl::layout::col_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::copy_batch, n, - (const fp **)x_array, incx, y_array, incy, group_count, + (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; case oneapi::mkl::layout::row_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::copy_batch, n, - (const fp **)x_array, incx, y_array, incy, group_count, + (const fp**)x_array, incx, y_array, incy, group_count, group_size, dependencies); break; default: break; @@ -183,13 +180,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during COPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { idx = 0; for (i = 0; i < group_count; i++) { for (j = 0; j < group_size[i]; j++) { @@ -209,7 +206,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { 
std::cout << "Error raised during execution of COPY_BATCH:\n" << error.what() << std::endl; } @@ -246,7 +243,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class CopyBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(CopyBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/dgmm_batch_stride.cpp b/tests/unit_tests/blas/batch/dgmm_batch_stride.cpp index bb642c3ee..5e4bd82d8 100644 --- a/tests/unit_tests/blas/batch/dgmm_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/dgmm_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, int64_t incx, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, int64_t incx, int64_t batch_size) { // Prepare data. int64_t m, n; @@ -90,21 +90,20 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, for (i = 0; i < batch_size_ref; i++) { ::dgmm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right), - (const int *)&m_ref, (const int *)&n_ref, (const fp_ref *)(A.data() + stride_a * i), - (const int *)&lda_ref, (const fp_ref *)(x.data() + stride_x * i), - (const int *)&incx_ref, (fp_ref *)(C_ref.data() + stride_c * i), - (const int *)&ldc_ref); + (const int*)&m_ref, (const int*)&n_ref, (const fp_ref*)(A.data() + stride_a * i), + (const int*)&lda_ref, (const fp_ref*)(x.data() + stride_x * i), + (const int*)&incx_ref, (fp_ref*)(C_ref.data() + stride_c * i), (const int*)&ldc_ref); } // Call DPC++ DGMM_BATCH_STRIDE. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during DGMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -149,17 +148,17 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during DGMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of DGMM_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -177,7 +176,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, } class DgmmBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(DgmmBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), diff --git a/tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp index bb9cf0df3..7b6389b0f 100644 --- a/tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/dgmm_batch_stride_usm.cpp @@ -43,20 +43,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, int64_t incx, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, int64_t incx, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during DGMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -115,10 +115,9 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, for (i = 0; i < batch_size_ref; i++) { ::dgmm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right), - (const int *)&m_ref, (const int *)&n_ref, (const fp_ref *)(A.data() + stride_a * i), - (const int *)&lda_ref, (const fp_ref *)(x.data() + stride_x * i), - (const int *)&incx_ref, (fp_ref *)(C_ref.data() + stride_c * i), - (const int *)&ldc_ref); + (const int*)&m_ref, (const int*)&n_ref, (const fp_ref*)(A.data() + stride_a * i), + (const int*)&lda_ref, (const fp_ref*)(x.data() + stride_x * i), + (const int*)&incx_ref, (fp_ref*)(C_ref.data() + stride_c * i), (const int*)&ldc_ref); } // Call DPC++ DGMM_BATCH_STRIDE. 
@@ -156,17 +155,17 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during DGMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of DGMM_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -182,7 +181,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::side left_right, } class DgmmBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(DgmmBatchStrideUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), diff --git a/tests/unit_tests/blas/batch/dgmm_batch_usm.cpp b/tests/unit_tests/blas/batch/dgmm_batch_usm.cpp index 1f568580f..87b127358 100644 --- a/tests/unit_tests/blas/batch/dgmm_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/dgmm_batch_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during DGMM_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -102,9 +102,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - auto uafpp = usm_allocator(cxt, *dev); - vector a_array(uafpp), x_array(uafpp), c_array(uafpp), - c_ref_array(uafpp); + auto uafpp = usm_allocator(cxt, *dev); + vector a_array(uafpp), x_array(uafpp), c_array(uafpp), c_ref_array(uafpp); a_array.resize(total_batch_count); x_array.resize(total_batch_count); c_array.resize(total_batch_count); @@ -117,10 +116,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { size_x = 1 + (x_len - 1) * std::abs(incx[i]); size_c = (layout == oneapi::mkl::layout::col_major) ? 
ldc[i] * n[i] : ldc[i] * m[i]; for (j = 0; j < group_size[i]; j++) { - a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); - x_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_x, *dev, cxt); - c_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); - c_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); + a_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); + x_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_x, *dev, cxt); + c_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); + c_ref_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); rand_matrix(a_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], lda[i]); rand_vector(x_array[idx], x_len, incx[i]); rand_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldc[i]); @@ -132,15 +131,15 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { // Call reference DGMM_BATCH. 
using fp_ref = typename ref_type_info::type; - int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *incx_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *ldc_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* m_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* n_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* lda_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* incx_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* ldc_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* group_size_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - CBLAS_SIDE *left_right_ref = - (CBLAS_SIDE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_SIDE) * group_count); + CBLAS_SIDE* left_right_ref = + (CBLAS_SIDE*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_SIDE) * group_count); if ((m_ref == NULL) || (n_ref == NULL) || (lda_ref == NULL) || (incx_ref == NULL) || (ldc_ref == NULL) || (left_right_ref == NULL) || (group_size_ref == NULL)) { @@ -174,10 +173,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { ldc_ref[i] = (int)ldc[i]; group_size_ref[i] = (int)group_size[i]; for (j = 0; j < group_size_ref[i]; j++) { - ::dgmm(convert_to_cblas_layout(layout), left_right_ref[i], (const int *)&m_ref[i], - (const int *)&n_ref[i], (const fp_ref *)a_array[idx], (const int *)&lda_ref[i], - (const fp_ref *)x_array[idx], (const int *)&incx_ref[i], - (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref[i]); + ::dgmm(convert_to_cblas_layout(layout), left_right_ref[i], (const int*)&m_ref[i], 
+ (const int*)&n_ref[i], (const fp_ref*)a_array[idx], (const int*)&lda_ref[i], + (const fp_ref*)x_array[idx], (const int*)&incx_ref[i], (fp_ref*)c_ref_array[idx], + (const int*)&ldc_ref[i]); idx++; } } @@ -189,14 +188,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { switch (layout) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::dgmm_batch( - main_queue, &left_right[0], &m[0], &n[0], (const fp **)&a_array[0], &lda[0], - (const fp **)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count, + main_queue, &left_right[0], &m[0], &n[0], (const fp**)&a_array[0], &lda[0], + (const fp**)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: done = oneapi::mkl::blas::row_major::dgmm_batch( - main_queue, &left_right[0], &m[0], &n[0], (const fp **)&a_array[0], &lda[0], - (const fp **)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count, + main_queue, &left_right[0], &m[0], &n[0], (const fp**)&a_array[0], &lda[0], + (const fp**)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; default: break; @@ -206,14 +205,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { switch (layout) { case oneapi::mkl::layout::col_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::dgmm_batch, - &left_right[0], &m[0], &n[0], (const fp **)&a_array[0], - &lda[0], (const fp **)&x_array[0], &incx[0], &c_array[0], + &left_right[0], &m[0], &n[0], (const fp**)&a_array[0], + &lda[0], (const fp**)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::dgmm_batch, - &left_right[0], &m[0], &n[0], (const fp **)&a_array[0], - &lda[0], (const fp **)&x_array[0], &incx[0], &c_array[0], + &left_right[0], &m[0], &n[0], (const 
fp**)&a_array[0], + &lda[0], (const fp**)&x_array[0], &incx[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; default: break; @@ -221,13 +220,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during DGMM_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::aligned_free(m_ref); oneapi::mkl::aligned_free(n_ref); oneapi::mkl::aligned_free(lda_ref); @@ -248,7 +247,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of DGMM_BATCH:\n" << error.what() << std::endl; } @@ -285,7 +284,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class DgmmBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(DgmmBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride.cpp index 5241cb822..3dad54f33 100644 --- a/tests/unit_tests/blas/batch/gemm_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/gemm_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Prepare data. 
int64_t m, n, k; int64_t lda, ldb, ldc; @@ -135,23 +135,22 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { for (i = 0; i < batch_size_ref; i++) { ::gemm(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), - convert_to_cblas_trans(transb), (const int *)&m_ref, (const int *)&n_ref, - (const int *)&k_ref, (const fp_ref *)&alpha, - (const fp_ref *)(A_ref.data() + stride_a * i), (const int *)&lda_ref, - (const fp_ref *)(B_ref.data() + stride_b * i), (const int *)&ldb_ref, - (const fp_ref *)&beta, (fp_ref *)(C_ref.data() + stride_c * i), - (const int *)&ldc_ref); + convert_to_cblas_trans(transb), (const int*)&m_ref, (const int*)&n_ref, + (const int*)&k_ref, (const fp_ref*)&alpha, + (const fp_ref*)(A_ref.data() + stride_a * i), (const int*)&lda_ref, + (const fp_ref*)(B_ref.data() + stride_b * i), (const int*)&ldb_ref, + (const fp_ref*)&beta, (fp_ref*)(C_ref.data() + stride_c * i), (const int*)&ldc_ref); } // Call DPC++ GEMM_BATCH_STRIDE. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -199,17 +198,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { #endif main_queue.wait_and_throw(); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -232,7 +231,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class GemmBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GemmBatchStrideTests, RealHalfPrecision) { EXPECT_TRUEORSKIP((test( diff --git a/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp index 97f2dd086..12a5a4f61 100644 --- a/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/gemm_batch_stride_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -136,10 +136,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { C_ref.resize(stride_c * batch_size); C_cast_ref.resize(stride_c * batch_size); - Ta **a_array = (Ta **)oneapi::mkl::malloc_shared(64, sizeof(Ta *) * batch_size, *dev, cxt); - Tb **b_array = (Tb **)oneapi::mkl::malloc_shared(64, sizeof(Tb *) * batch_size, *dev, cxt); - Tc **c_array = (Tc **)oneapi::mkl::malloc_shared(64, sizeof(Tc *) * batch_size, *dev, cxt); - Ts **c_ref_array = (Ts **)oneapi::mkl::malloc_shared(64, sizeof(Ts *) * batch_size, *dev, cxt); + Ta** a_array = (Ta**)oneapi::mkl::malloc_shared(64, sizeof(Ta*) * batch_size, *dev, cxt); + Tb** b_array = (Tb**)oneapi::mkl::malloc_shared(64, sizeof(Tb*) * batch_size, *dev, cxt); + Tc** c_array = (Tc**)oneapi::mkl::malloc_shared(64, sizeof(Tc*) * batch_size, *dev, cxt); + Ts** c_ref_array = (Ts**)oneapi::mkl::malloc_shared(64, sizeof(Ts*) * batch_size, *dev, cxt); if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) { std::cout << "Error cannot allocate arrays of pointers\n"; @@ -181,12 +181,11 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { int batch_size_ref = (int)batch_size; for (i = 0; i < batch_size_ref; i++) { ::gemm(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), - convert_to_cblas_trans(transb), (const int *)&m_ref, (const int *)&n_ref, - (const int *)&k_ref, (const fp_ref *)&alpha, - (const fp_ref *)(A_ref.data() + stride_a * i), (const int *)&lda_ref, - (const fp_ref *)(B_ref.data() + stride_b * i), (const int *)&ldb_ref, - (const fp_ref *)&beta, (fp_ref 
*)(C_ref.data() + stride_c * i), - (const int *)&ldc_ref); + convert_to_cblas_trans(transb), (const int*)&m_ref, (const int*)&n_ref, + (const int*)&k_ref, (const fp_ref*)&alpha, + (const fp_ref*)(A_ref.data() + stride_a * i), (const int*)&lda_ref, + (const fp_ref*)(B_ref.data() + stride_b * i), (const int*)&ldb_ref, + (const fp_ref*)&beta, (fp_ref*)(C_ref.data() + stride_c * i), (const int*)&ldc_ref); } // Call DPC++ GEMM_BATCH_STRIDE. @@ -226,13 +225,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { main_queue.wait_and_throw(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::free_shared(a_array, cxt); oneapi::mkl::free_shared(b_array, cxt); oneapi::mkl::free_shared(c_array, cxt); @@ -240,7 +239,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMM_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -267,7 +266,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class GemmBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GemmBatchStrideUsmTests, RealHalfPrecision) { EXPECT_TRUEORSKIP((test( diff --git a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp index a651f9ae3..a78bbb26f 100644 --- a/tests/unit_tests/blas/batch/gemm_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/gemm_batch_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout 
layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -126,14 +126,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - auto uaTap = usm_allocator(cxt, *dev); - auto uaTbp = usm_allocator(cxt, *dev); - auto uaTcp = usm_allocator(cxt, *dev); - auto uaTsp = usm_allocator(cxt, *dev); - vector a_array(uaTap); - vector b_array(uaTbp); - vector c_array(uaTcp), c_cast_ref_array(uaTcp); - vector a_ref_array(uaTsp), b_ref_array(uaTsp), c_ref_array(uaTsp); + auto uaTap = usm_allocator(cxt, *dev); + auto uaTbp = usm_allocator(cxt, *dev); + auto uaTcp = usm_allocator(cxt, *dev); + auto uaTsp = usm_allocator(cxt, *dev); + vector a_array(uaTap); + vector b_array(uaTbp); + vector c_array(uaTcp), c_cast_ref_array(uaTcp); + vector a_ref_array(uaTsp), b_ref_array(uaTsp), c_ref_array(uaTsp); a_array.resize(total_batch_count); b_array.resize(total_batch_count); c_array.resize(total_batch_count); @@ -158,14 +158,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { default: break; } for (j = 0; j < group_size[i]; j++) { - a_array[idx] = (Ta *)oneapi::mkl::malloc_shared(64, sizeof(Ta) * size_a, *dev, cxt); - b_array[idx] = (Tb *)oneapi::mkl::malloc_shared(64, sizeof(Tb) * size_b, *dev, cxt); - c_array[idx] = (Tc *)oneapi::mkl::malloc_shared(64, sizeof(Tc) * size_c, *dev, cxt); - a_ref_array[idx] = (Ts *)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_a, *dev, cxt); - b_ref_array[idx] = (Ts *)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_b, 
*dev, cxt); + a_array[idx] = (Ta*)oneapi::mkl::malloc_shared(64, sizeof(Ta) * size_a, *dev, cxt); + b_array[idx] = (Tb*)oneapi::mkl::malloc_shared(64, sizeof(Tb) * size_b, *dev, cxt); + c_array[idx] = (Tc*)oneapi::mkl::malloc_shared(64, sizeof(Tc) * size_c, *dev, cxt); + a_ref_array[idx] = (Ts*)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_a, *dev, cxt); + b_ref_array[idx] = (Ts*)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_b, *dev, cxt); c_cast_ref_array[idx] = - (Tc *)oneapi::mkl::malloc_shared(64, sizeof(Tc) * size_c, *dev, cxt); - c_ref_array[idx] = (Ts *)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_c, *dev, cxt); + (Tc*)oneapi::mkl::malloc_shared(64, sizeof(Tc) * size_c, *dev, cxt); + c_ref_array[idx] = (Ts*)oneapi::mkl::malloc_shared(64, sizeof(Ts) * size_c, *dev, cxt); rand_matrix(a_array[idx], layout, transa[i], m[i], k[i], lda[i]); rand_matrix(b_array[idx], layout, transb[i], k[i], n[i], ldb[i]); rand_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldc[i]); @@ -179,18 +179,18 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { // Call reference GEMM_BATCH. 
using fp_ref = typename ref_type_info::type; - int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *k_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *ldb_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *ldc_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - - CBLAS_TRANSPOSE *transa_ref = - (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); - CBLAS_TRANSPOSE *transb_ref = - (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); + int* m_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* n_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* k_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* lda_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* ldb_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* ldc_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* group_size_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + + CBLAS_TRANSPOSE* transa_ref = + (CBLAS_TRANSPOSE*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); + CBLAS_TRANSPOSE* transb_ref = + (CBLAS_TRANSPOSE*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); if ((m_ref == NULL) || (n_ref == NULL) || (k_ref == NULL) || (lda_ref == NULL) || (ldb_ref == NULL) || (ldc_ref == NULL) || (transa_ref == NULL) || (transb_ref == NULL) || @@ -233,11 +233,11 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { group_size_ref[i] = 
(int)group_size[i]; for (j = 0; j < group_size_ref[i]; j++) { ::gemm(convert_to_cblas_layout(layout), transa_ref[i], transb_ref[i], - (const int *)&m_ref[i], (const int *)&n_ref[i], (const int *)&k_ref[i], - (const fp_ref *)&alpha[i], (const fp_ref *)a_ref_array[idx], - (const int *)&lda_ref[i], (const fp_ref *)b_ref_array[idx], - (const int *)&ldb_ref[i], (const fp_ref *)&beta[i], (fp_ref *)c_ref_array[idx], - (const int *)&ldc_ref[i]); + (const int*)&m_ref[i], (const int*)&n_ref[i], (const int*)&k_ref[i], + (const fp_ref*)&alpha[i], (const fp_ref*)a_ref_array[idx], + (const int*)&lda_ref[i], (const fp_ref*)b_ref_array[idx], + (const int*)&ldb_ref[i], (const fp_ref*)&beta[i], (fp_ref*)c_ref_array[idx], + (const int*)&ldc_ref[i]); idx++; } } @@ -250,13 +250,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::gemm_batch( main_queue, &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0], - (const Ta **)&a_array[0], &lda[0], (const Tb **)&b_array[0], &ldb[0], &beta[0], + (const Ta**)&a_array[0], &lda[0], (const Tb**)&b_array[0], &ldb[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: done = oneapi::mkl::blas::row_major::gemm_batch( main_queue, &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0], - (const Ta **)&a_array[0], &lda[0], (const Tb **)&b_array[0], &ldb[0], &beta[0], + (const Ta**)&a_array[0], &lda[0], (const Tb**)&b_array[0], &ldb[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; default: break; @@ -267,14 +267,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemm_batch, &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0], - (const Ta **)&a_array[0], &lda[0], (const Tb **)&b_array[0], + (const 
Ta**)&a_array[0], &lda[0], (const Tb**)&b_array[0], &ldb[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::gemm_batch, &transa[0], &transb[0], &m[0], &n[0], &k[0], &alpha[0], - (const Ta **)&a_array[0], &lda[0], (const Ta **)&b_array[0], + (const Ta**)&a_array[0], &lda[0], (const Ta**)&b_array[0], &ldb[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; @@ -283,13 +283,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait_and_throw(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMM_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::aligned_free(m_ref); oneapi::mkl::aligned_free(n_ref); oneapi::mkl::aligned_free(k_ref); @@ -315,7 +315,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMM_BATCH:\n" << error.what() << std::endl; } @@ -364,7 +364,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class GemmBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GemmBatchUsmTests, RealHalfPrecision) { EXPECT_TRUEORSKIP((test( diff --git a/tests/unit_tests/blas/batch/gemv_batch_stride.cpp b/tests/unit_tests/blas/batch/gemv_batch_stride.cpp index bd92f70ca..f50686c13 100644 --- a/tests/unit_tests/blas/batch/gemv_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/gemv_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; 
namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { // Prepare data. int64_t m, n; int64_t lda; @@ -103,23 +103,22 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in int batch_size_ref = (int)batch_size; for (i = 0; i < batch_size_ref; i++) { - ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), (const int *)&m_ref, - (const int *)&n_ref, (const fp_ref *)&alpha, - (const fp_ref *)(A.data() + stride_a * i), (const int *)&lda_ref, - (const fp_ref *)(x.data() + stride_x * i), (const int *)&incx_ref, - (const fp_ref *)&beta, (fp_ref *)(y_ref.data() + stride_y * i), - (const int *)&incy_ref); + ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), (const int*)&m_ref, + (const int*)&n_ref, (const fp_ref*)&alpha, (const fp_ref*)(A.data() + stride_a * i), + (const int*)&lda_ref, (const fp_ref*)(x.data() + stride_x * i), + (const int*)&incx_ref, (const fp_ref*)&beta, (fp_ref*)(y_ref.data() + stride_y * i), + (const int*)&incy_ref); } // Call DPC++ GEMV_BATCH_STRIDE. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMV_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -164,17 +163,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMV_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMV_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -192,7 +191,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in } class GemvBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GemvBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5)); diff --git a/tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp index d6eb47887..a61d7d318 100644 --- a/tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/gemv_batch_stride_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -128,10 +128,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in int batch_size_ref = (int)batch_size; for (i = 0; i < batch_size_ref; i++) { - ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), (const int *)&m_ref, - (const int *)&n_ref, (const fp_ref *)&alpha, (const fp_ref *)&A[stride_a * i], - (const int *)&lda_ref, (const fp_ref *)&x[stride_x * i], (const int *)&incx_ref, - (const fp_ref *)&beta, (fp_ref *)&y_ref[stride_y * i], (const int *)&incy_ref); + ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), (const int*)&m_ref, + (const int*)&n_ref, (const fp_ref*)&alpha, (const fp_ref*)&A[stride_a * i], + (const int*)&lda_ref, (const fp_ref*)&x[stride_x * i], (const int*)&incx_ref, + (const fp_ref*)&beta, (fp_ref*)&y_ref[stride_y * i], (const int*)&incy_ref); } // Call DPC++ GEMV_BATCH_STRIDE. 
@@ -171,17 +171,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMV_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMV_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -197,7 +197,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t incx, int64_t incy, in } class GemvBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GemvBatchStrideUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 2, 3, 5)); diff --git a/tests/unit_tests/blas/batch/gemv_batch_usm.cpp b/tests/unit_tests/blas/batch/gemv_batch_usm.cpp index 4ad661f5b..2d257d0be 100644 --- a/tests/unit_tests/blas/batch/gemv_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/gemv_batch_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMV_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -119,9 +119,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - auto uafpp = usm_allocator(cxt, *dev); - vector a_array(uafpp), x_array(uafpp), y_array(uafpp), - y_ref_array(uafpp); + auto uafpp = usm_allocator(cxt, *dev); + vector a_array(uafpp), x_array(uafpp), y_array(uafpp), y_ref_array(uafpp); a_array.resize(total_batch_count); x_array.resize(total_batch_count); y_array.resize(total_batch_count); @@ -135,10 +134,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { size_x = 1 + (x_len - 1) * std::abs(incx[i]); size_y = 1 + (y_len - 1) * std::abs(incy[i]); for (j = 0; j < group_size[i]; j++) { - a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); - x_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_x, *dev, cxt); - y_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_y, *dev, cxt); - y_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_y, *dev, cxt); + a_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); + x_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_x, *dev, cxt); + y_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_y, *dev, cxt); + y_ref_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_y, *dev, cxt); rand_matrix(a_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], lda[i]); rand_vector(x_array[idx], x_len, incx[i]); rand_vector(y_array[idx], y_len, incy[i]); @@ -149,15 +148,15 @@ int test(device *dev, 
oneapi::mkl::layout layout, int64_t group_count) { // Call reference GEMV_BATCH. using fp_ref = typename ref_type_info::type; - int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *incx_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *incy_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* m_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* n_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* lda_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* incx_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* incy_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* group_size_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - CBLAS_TRANSPOSE *transa_ref = - (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); + CBLAS_TRANSPOSE* transa_ref = + (CBLAS_TRANSPOSE*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); if ((m_ref == NULL) || (n_ref == NULL) || (lda_ref == NULL) || (incx_ref == NULL) || (incy_ref == NULL) || (transa_ref == NULL) || (group_size_ref == NULL)) { @@ -191,11 +190,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { incy_ref[i] = (int)incy[i]; group_size_ref[i] = (int)group_size[i]; for (j = 0; j < group_size_ref[i]; j++) { - ::gemv(convert_to_cblas_layout(layout), transa_ref[i], (const int *)&m_ref[i], - (const int *)&n_ref[i], (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx], - (const int *)&lda_ref[i], (const fp_ref *)x_array[idx], - (const int *)&incx_ref[i], (const 
fp_ref *)&beta[i], (fp_ref *)y_ref_array[idx], - (const int *)&incy_ref[i]); + ::gemv(convert_to_cblas_layout(layout), transa_ref[i], (const int*)&m_ref[i], + (const int*)&n_ref[i], (const fp_ref*)&alpha[i], (const fp_ref*)a_array[idx], + (const int*)&lda_ref[i], (const fp_ref*)x_array[idx], (const int*)&incx_ref[i], + (const fp_ref*)&beta[i], (fp_ref*)y_ref_array[idx], (const int*)&incy_ref[i]); idx++; } } @@ -207,14 +205,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { switch (layout) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::gemv_batch( - main_queue, &transa[0], &m[0], &n[0], &alpha[0], (const fp **)&a_array[0], - &lda[0], (const fp **)&x_array[0], &incx[0], &beta[0], &y_array[0], &incy[0], + main_queue, &transa[0], &m[0], &n[0], &alpha[0], (const fp**)&a_array[0], + &lda[0], (const fp**)&x_array[0], &incx[0], &beta[0], &y_array[0], &incy[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: done = oneapi::mkl::blas::row_major::gemv_batch( - main_queue, &transa[0], &m[0], &n[0], &alpha[0], (const fp **)&a_array[0], - &lda[0], (const fp **)&x_array[0], &incx[0], &beta[0], &y_array[0], &incy[0], + main_queue, &transa[0], &m[0], &n[0], &alpha[0], (const fp**)&a_array[0], + &lda[0], (const fp**)&x_array[0], &incx[0], &beta[0], &y_array[0], &incy[0], group_count, &group_size[0], dependencies); break; default: break; @@ -225,29 +223,28 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::gemv_batch, &transa[0], &m[0], &n[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], (const fp **)&x_array[0], + (const fp**)&a_array[0], &lda[0], (const fp**)&x_array[0], &incx[0], &beta[0], &y_array[0], &incy[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: - TEST_RUN_BLAS_CT_SELECT(main_queue, 
oneapi::mkl::blas::row_major::gemv_batch, - &transa[0], &m[0], &n[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], (const fp **)&x_array[0], - &incx[0], &beta[0], &y_array[0], &incy[0], group_count, - &group_size[0], dependencies); + TEST_RUN_BLAS_CT_SELECT( + main_queue, oneapi::mkl::blas::row_major::gemv_batch, &transa[0], &m[0], &n[0], + &alpha[0], (const fp**)&a_array[0], &lda[0], (const fp**)&x_array[0], &incx[0], + &beta[0], &y_array[0], &incy[0], group_count, &group_size[0], dependencies); break; default: break; } main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMV_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::aligned_free(m_ref); oneapi::mkl::aligned_free(n_ref); oneapi::mkl::aligned_free(lda_ref); @@ -268,7 +265,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMV_BATCH:\n" << error.what() << std::endl; } @@ -306,7 +303,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class GemvBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GemvBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp b/tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp index ac8bbb2b4..a6e9a6fe5 100644 --- a/tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/imatcopy_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device 
*dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Prepare data. int64_t m, n; int64_t lda, ldb; @@ -101,11 +101,11 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during IMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -146,17 +146,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during IMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of IMATCOPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -171,7 +171,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class ImatcopyBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(ImatcopyBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp index b3099d309..db40e3a1f 100644 --- a/tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/imatcopy_batch_stride_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; 
-extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -103,8 +103,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { AB.resize(stride * batch_size); AB_ref.resize(stride * batch_size); - fp **ab_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **ab_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); + fp** ab_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** ab_ref_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); if ((ab_array == NULL) || (ab_ref_array == NULL)) { std::cout << "Error cannot allocate arrays of pointers\n"; oneapi::mkl::free_shared(ab_array, cxt); @@ -166,19 +166,19 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during IMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::free_shared(ab_array, cxt); oneapi::mkl::free_shared(ab_ref_array, cxt); return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of 
IMATCOPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -194,7 +194,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class ImatcopyBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(ImatcopyBatchStrideUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp b/tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp index 74c9881af..d203f2440 100644 --- a/tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/imatcopy_batch_usm.cpp @@ -44,19 +44,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during IMATCOPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -103,8 +103,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - auto uafpp = usm_allocator(cxt, *dev); - vector ab_array(uafpp), ab_ref_array(uafpp); + auto uafpp = usm_allocator(cxt, *dev); + vector ab_array(uafpp), ab_ref_array(uafpp); ab_array.resize(total_batch_count); ab_ref_array.resize(total_batch_count); @@ -126,8 +126,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } size = std::max(size_a, size_b); for (j = 0; j < group_size[i]; j++) { - ab_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size, *dev, 
cxt); - ab_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size, *dev, cxt); + ab_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size, *dev, cxt); + ab_ref_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size, *dev, cxt); rand_matrix(ab_array[idx], oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size, 1, size); copy_matrix(ab_array[idx], oneapi::mkl::layout::col_major, @@ -187,13 +187,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during IMATCOPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { idx = 0; for (i = 0; i < group_count; i++) { for (j = 0; j < group_size[i]; j++) { @@ -205,7 +205,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of IMATCOPY_BATCH:\n" << error.what() << std::endl; } @@ -249,7 +249,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class ImatcopyBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(ImatcopyBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/omatadd_batch_stride.cpp b/tests/unit_tests/blas/batch/omatadd_batch_stride.cpp index cc20d0e3b..f036d0bbb 100644 --- a/tests/unit_tests/blas/batch/omatadd_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/omatadd_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, 
oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Prepare data. int64_t m, n; int64_t lda, ldb, ldc; @@ -111,11 +111,11 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATADD_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -162,17 +162,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATADD_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATADD_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -187,7 +187,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class OmataddBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmataddBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp index 7388084cb..59cd4ced3 100644 --- a/tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/omatadd_batch_stride_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern 
std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATADD_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -110,10 +110,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { C.resize(stride_c * batch_size); C_ref.resize(stride_c * batch_size); - fp **a_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **b_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **c_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **c_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); + fp** a_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** b_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** c_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** c_ref_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); if ((a_array == NULL) || (b_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) { std::cout << "Error cannot allocate arrays of pointers\n"; @@ -188,13 +188,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATADD_BATCH_STRIDE:\n" << e.what() << std::endl; 
print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::free_shared(a_array, cxt); oneapi::mkl::free_shared(b_array, cxt); oneapi::mkl::free_shared(c_array, cxt); @@ -202,7 +202,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATADD_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -220,7 +220,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class OmataddBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmataddBatchStrideUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp b/tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp index d08329fc6..16b407890 100644 --- a/tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/omatcopy_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Prepare data. int64_t m, n; int64_t lda, ldb; @@ -101,11 +101,11 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -149,17 +149,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATCOPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -174,7 +174,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class OmatcopyBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmatcopyBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp index 7479b57db..9533a3030 100644 --- a/tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/omatcopy_batch_stride_usm.cpp @@ -44,19 +44,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -105,9 +105,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { B.resize(stride_b * batch_size); B_ref.resize(stride_b * batch_size); - fp **a_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **b_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **b_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); + fp** a_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** b_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** b_ref_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); if ((a_array == NULL) || (b_array == NULL) || (b_ref_array == NULL)) { std::cout << "Error cannot allocate arrays of pointers\n"; @@ -175,20 +175,20 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATCOPY_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::free_shared(a_array, cxt); oneapi::mkl::free_shared(b_array, cxt); oneapi::mkl::free_shared(b_ref_array, cxt); return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATCOPY_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ 
-205,7 +205,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class OmatcopyBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmatcopyBatchStrideUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp b/tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp index 7f1e4a103..e0eb3feaa 100644 --- a/tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/omatcopy_batch_usm.cpp @@ -44,19 +44,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -103,8 +103,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - auto uafpp = usm_allocator(cxt, *dev); - vector a_array(uafpp), b_array(uafpp), b_ref_array(uafpp); + auto uafpp = usm_allocator(cxt, *dev); + vector a_array(uafpp), b_array(uafpp), b_ref_array(uafpp); a_array.resize(total_batch_count); b_array.resize(total_batch_count); @@ -126,9 +126,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { default: break; } for (j = 0; j < group_size[i]; j++) { - a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); - b_array[idx] = (fp 
*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); - b_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); + a_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); + b_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); + b_ref_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); rand_matrix(a_array[idx], oneapi::mkl::layout::col_major, oneapi::mkl::transpose::nontrans, size_a, 1, size_a); rand_matrix(b_array[idx], oneapi::mkl::layout::col_major, @@ -161,14 +161,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::omatcopy_batch( main_queue, trans.data(), m.data(), n.data(), alpha.data(), - (const fp **)a_array.data(), lda.data(), b_array.data(), ldb.data(), - group_count, group_size.data(), dependencies); + (const fp**)a_array.data(), lda.data(), b_array.data(), ldb.data(), group_count, + group_size.data(), dependencies); break; case oneapi::mkl::layout::row_major: done = oneapi::mkl::blas::row_major::omatcopy_batch( main_queue, trans.data(), m.data(), n.data(), alpha.data(), - (const fp **)a_array.data(), lda.data(), b_array.data(), ldb.data(), - group_count, group_size.data(), dependencies); + (const fp**)a_array.data(), lda.data(), b_array.data(), ldb.data(), group_count, + group_size.data(), dependencies); break; default: break; } @@ -178,13 +178,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::omatcopy_batch, trans.data(), m.data(), n.data(), alpha.data(), - (const fp **)a_array.data(), lda.data(), b_array.data(), + (const fp**)a_array.data(), lda.data(), b_array.data(), ldb.data(), group_count, group_size.data(), dependencies); break; case oneapi::mkl::layout::row_major: 
TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::omatcopy_batch, trans.data(), m.data(), n.data(), alpha.data(), - (const fp **)a_array.data(), lda.data(), b_array.data(), + (const fp**)a_array.data(), lda.data(), b_array.data(), ldb.data(), group_count, group_size.data(), dependencies); break; default: break; @@ -192,13 +192,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATCOPY_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { idx = 0; for (i = 0; i < group_count; i++) { for (j = 0; j < group_size[i]; j++) { @@ -211,7 +211,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATCOPY_BATCH:\n" << error.what() << std::endl; } @@ -255,7 +255,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class OmatcopyBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmatcopyBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/syrk_batch_stride.cpp b/tests/unit_tests/blas/batch/syrk_batch_stride.cpp index 58dc4d7dc..aeb33c42e 100644 --- a/tests/unit_tests/blas/batch/syrk_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/syrk_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) 
{ // Prepare data. int64_t n, k; int64_t lda, ldc; @@ -67,9 +67,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { upper_lower = (oneapi::mkl::uplo)(std::rand() % 2); if ((std::is_same::value) || (std::is_same::value)) { - trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans - : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans - : oneapi::mkl::transpose::conjtrans; + trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans + : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans + : oneapi::mkl::transpose::conjtrans; } else { trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans @@ -110,21 +110,21 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { for (i = 0; i < batch_size_ref; i++) { ::syrk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), - convert_to_cblas_trans(trans), (const int *)&n_ref, (const int *)&k_ref, - (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i), - (const int *)&lda_ref, (const fp_ref *)&beta, - (fp_ref *)(C_ref.data() + stride_c * i), (const int *)&ldc_ref); + convert_to_cblas_trans(trans), (const int*)&n_ref, (const int*)&k_ref, + (const fp_ref*)&alpha, (const fp_ref*)(A.data() + stride_a * i), + (const int*)&lda_ref, (const fp_ref*)&beta, (fp_ref*)(C_ref.data() + stride_c * i), + (const int*)&ldc_ref); } // Call DPC++ SYRK_BATCH_STRIDE. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYRK_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -168,17 +168,17 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYRK_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYRK_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -194,7 +194,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class SyrkBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SyrkBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp index 31aa09b79..b1f66fa07 100644 --- a/tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/syrk_batch_stride_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { +int test(device* dev, oneapi::mkl::layout layout, int64_t batch_size) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYRK_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -86,9 +86,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { beta = rand_scalar(); upper_lower = (oneapi::mkl::uplo)(std::rand() % 2); if ((std::is_same::value) || (std::is_same::value)) { - trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans - : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans - : oneapi::mkl::transpose::conjtrans; + trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans + : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans + : oneapi::mkl::transpose::conjtrans; } else { trans = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans @@ -116,9 +116,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { C.resize(stride_c * batch_size); C_ref.resize(stride_c * batch_size); - fp **a_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **c_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); - fp **c_ref_array = (fp **)oneapi::mkl::malloc_shared(64, sizeof(fp *) * batch_size, *dev, cxt); + fp** a_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** c_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); + fp** c_ref_array = (fp**)oneapi::mkl::malloc_shared(64, sizeof(fp*) * batch_size, *dev, cxt); if ((a_array == NULL) || (c_array == NULL) || (c_ref_array == NULL)) { std::cout << "Error cannot allocate arrays of pointers\n"; @@ -150,10 +150,10 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { int batch_size_ref = (int)batch_size; for 
(i = 0; i < batch_size_ref; i++) { ::syrk(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), - convert_to_cblas_trans(trans), (const int *)&n_ref, (const int *)&k_ref, - (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i), - (const int *)&lda_ref, (const fp_ref *)&beta, - (fp_ref *)(C_ref.data() + stride_c * i), (const int *)&ldc_ref); + convert_to_cblas_trans(trans), (const int*)&n_ref, (const int*)&k_ref, + (const fp_ref*)&alpha, (const fp_ref*)(A.data() + stride_a * i), + (const int*)&lda_ref, (const fp_ref*)&beta, (fp_ref*)(C_ref.data() + stride_c * i), + (const int*)&ldc_ref); } // Call DPC++ SYRK_BATCH_STRIDE. @@ -191,20 +191,20 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYRK_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::free_shared(a_array, cxt); oneapi::mkl::free_shared(c_array, cxt); oneapi::mkl::free_shared(c_ref_array, cxt); return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYRK_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -221,7 +221,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t batch_size) { } class SyrkBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SyrkBatchStrideUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/syrk_batch_usm.cpp b/tests/unit_tests/blas/batch/syrk_batch_usm.cpp index 36d0d6dd5..b331b4c66 100644 --- a/tests/unit_tests/blas/batch/syrk_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/syrk_batch_usm.cpp @@ -43,19 
+43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYRK_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -106,10 +106,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { beta[i] = rand_scalar(); upper_lower[i] = (oneapi::mkl::uplo)(std::rand() % 2); if ((std::is_same::value) || (std::is_same::value)) { - trans[i] = (std::rand() % 2) == 0 - ? oneapi::mkl::transpose::nontrans - : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans - : oneapi::mkl::transpose::conjtrans; + trans[i] = (std::rand() % 2) == 0 ? oneapi::mkl::transpose::nontrans + : (std::rand() % 2) == 0 ? oneapi::mkl::transpose::trans + : oneapi::mkl::transpose::conjtrans; } else { trans[i] = (std::rand() % 2) == 0 ? 
oneapi::mkl::transpose::nontrans @@ -118,8 +117,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - auto uafpp = usm_allocator(cxt, *dev); - vector a_array(uafpp), c_array(uafpp), c_ref_array(uafpp); + auto uafpp = usm_allocator(cxt, *dev); + vector a_array(uafpp), c_array(uafpp), c_ref_array(uafpp); a_array.resize(total_batch_count); c_array.resize(total_batch_count); c_ref_array.resize(total_batch_count); @@ -138,9 +137,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { default: break; } for (j = 0; j < group_size[i]; j++) { - a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); - c_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); - c_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); + a_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); + c_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); + c_ref_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_c, *dev, cxt); rand_matrix(a_array[idx], layout, trans[i], n[i], k[i], lda[i]); rand_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, n[i], n[i], ldc[i]); copy_matrix(c_array[idx], layout, oneapi::mkl::transpose::nontrans, n[i], n[i], ldc[i], @@ -151,16 +150,16 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { // Call reference SYRK_BATCH. 
using fp_ref = typename ref_type_info::type; - int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *k_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *ldc_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* n_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* k_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* lda_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* ldc_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* group_size_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - CBLAS_UPLO *upper_lower_ref = - (CBLAS_UPLO *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_UPLO) * group_count); - CBLAS_TRANSPOSE *trans_ref = - (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); + CBLAS_UPLO* upper_lower_ref = + (CBLAS_UPLO*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_UPLO) * group_count); + CBLAS_TRANSPOSE* trans_ref = + (CBLAS_TRANSPOSE*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); if ((n_ref == NULL) || (k_ref == NULL) || (lda_ref == NULL) || (ldc_ref == NULL) || (trans_ref == NULL) || (upper_lower_ref == NULL) || (group_size_ref == NULL)) { @@ -194,9 +193,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { group_size_ref[i] = (int)group_size[i]; for (j = 0; j < group_size_ref[i]; j++) { ::syrk(convert_to_cblas_layout(layout), upper_lower_ref[i], trans_ref[i], - (const int *)&n_ref[i], (const int *)&k_ref[i], (const fp_ref *)&alpha[i], - (const fp_ref *)a_array[idx], (const int *)&lda_ref[i], (const fp_ref *)&beta[i], - (fp_ref *)c_ref_array[idx], (const int *)&ldc_ref[i]); + (const 
int*)&n_ref[i], (const int*)&k_ref[i], (const fp_ref*)&alpha[i], + (const fp_ref*)a_array[idx], (const int*)&lda_ref[i], (const fp_ref*)&beta[i], + (fp_ref*)c_ref_array[idx], (const int*)&ldc_ref[i]); idx++; } } @@ -209,13 +208,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::syrk_batch( main_queue, &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count, + (const fp**)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: done = oneapi::mkl::blas::row_major::syrk_batch( main_queue, &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count, + (const fp**)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; default: break; @@ -226,13 +225,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::syrk_batch, &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0], + (const fp**)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::syrk_batch, &upper_lower[0], &trans[0], &n[0], &k[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], &beta[0], &c_array[0], + (const fp**)&a_array[0], &lda[0], &beta[0], &c_array[0], &ldc[0], group_count, &group_size[0], dependencies); break; default: break; @@ -240,13 +239,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait(); #endif } - catch (exception const &e) { + catch 
(exception const& e) { std::cout << "Caught synchronous SYCL exception during SYRK_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::aligned_free(n_ref); oneapi::mkl::aligned_free(k_ref); oneapi::mkl::aligned_free(lda_ref); @@ -266,7 +265,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYRK_BATCH:\n" << error.what() << std::endl; } @@ -301,7 +300,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class SyrkBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SyrkBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp b/tests/unit_tests/blas/batch/trsm_batch_stride.cpp index cde6aa367..c85e7a885 100644 --- a/tests/unit_tests/blas/batch/trsm_batch_stride.cpp +++ b/tests/unit_tests/blas/batch/trsm_batch_stride.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Prepare data. 
int64_t m, n; int64_t lda, ldb; @@ -116,21 +116,20 @@ int test(device *dev, oneapi::mkl::layout layout) { for (i = 0; i < batch_size_ref; i++) { ::trsm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - convert_to_cblas_diag(unit_nonunit), (const int *)&m_ref, (const int *)&n_ref, - (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i), - (const int *)&lda_ref, (fp_ref *)(B_ref.data() + stride_b * i), - (const int *)&ldb_ref); + convert_to_cblas_diag(unit_nonunit), (const int*)&m_ref, (const int*)&n_ref, + (const fp_ref*)&alpha, (const fp_ref*)(A.data() + stride_a * i), + (const int*)&lda_ref, (fp_ref*)(B_ref.data() + stride_b * i), (const int*)&ldb_ref); } // Call DPC++ TRSM_BATCH_STRIDE. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -176,17 +175,17 @@ int test(device *dev, oneapi::mkl::layout layout) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during TRSM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of TRSM_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -201,7 +200,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class TrsmBatchStrideTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; 
TEST_P(TrsmBatchStrideTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp b/tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp index d99836f87..1b518d5bb 100644 --- a/tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp +++ b/tests/unit_tests/blas/batch/trsm_batch_stride_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); @@ -137,10 +137,9 @@ int test(device *dev, oneapi::mkl::layout layout) { for (i = 0; i < batch_size_ref; i++) { ::trsm(convert_to_cblas_layout(layout), convert_to_cblas_side(left_right), convert_to_cblas_uplo(upper_lower), convert_to_cblas_trans(trans), - convert_to_cblas_diag(unit_nonunit), (const int *)&m_ref, (const int *)&n_ref, - (const fp_ref *)&alpha, (const fp_ref *)(A.data() + stride_a * i), - (const int *)&lda_ref, (fp_ref *)(B_ref.data() + stride_b * i), - (const int *)&ldb_ref); + convert_to_cblas_diag(unit_nonunit), (const int*)&m_ref, (const int*)&n_ref, + (const fp_ref*)&alpha, (const fp_ref*)(A.data() + stride_a * i), + (const int*)&lda_ref, (fp_ref*)(B_ref.data() + stride_b * i), (const int*)&ldb_ref); } // Call DPC++ TRSM_BATCH_STRIDE. 
@@ -180,17 +179,17 @@ int test(device *dev, oneapi::mkl::layout layout) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during TRSM_BATCH_STRIDE:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of TRSM_BATCH_STRIDE:\n" << error.what() << std::endl; } @@ -203,7 +202,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class TrsmBatchStrideUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(TrsmBatchStrideUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/batch/trsm_batch_usm.cpp b/tests/unit_tests/blas/batch/trsm_batch_usm.cpp index 747f59433..b7ddff8c8 100644 --- a/tests/unit_tests/blas/batch/trsm_batch_usm.cpp +++ b/tests/unit_tests/blas/batch/trsm_batch_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { +int test(device* dev, oneapi::mkl::layout layout, int64_t group_count) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during TRSM_BATCH:\n" << e.what() << std::endl; print_error_code(e); @@ -128,8 +128,8 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { total_batch_count += group_size[i]; } - auto uafpp = usm_allocator(cxt, *dev); - vector a_array(uafpp), b_array(uafpp), b_ref_array(uafpp); + auto uafpp = usm_allocator(cxt, *dev); + vector a_array(uafpp), b_array(uafpp), b_ref_array(uafpp); a_array.resize(total_batch_count); b_array.resize(total_batch_count); @@ -141,9 +141,9 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { Arank = left_right[i] == oneapi::mkl::side::left ? m[i] : n[i]; size_b = ldb[i] * ((layout == oneapi::mkl::layout::col_major) ? n[i] : m[i]); for (j = 0; j < group_size[i]; j++) { - a_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); - b_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); - b_ref_array[idx] = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); + a_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_a, *dev, cxt); + b_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); + b_ref_array[idx] = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp) * size_b, *dev, cxt); rand_trsm_matrix(a_array[idx], layout, trans[i], Arank, Arank, lda[i]); rand_matrix(b_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldb[i]); copy_matrix(b_array[idx], layout, oneapi::mkl::transpose::nontrans, m[i], n[i], ldb[i], @@ -154,20 +154,20 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { // Call reference TRSM_BATCH. 
using fp_ref = typename ref_type_info::type; - int *m_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *n_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *lda_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *ldb_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - int *group_size_ref = (int *)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); - - CBLAS_TRANSPOSE *trans_ref = - (CBLAS_TRANSPOSE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); - CBLAS_SIDE *left_right_ref = - (CBLAS_SIDE *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_SIDE) * group_count); - CBLAS_UPLO *upper_lower_ref = - (CBLAS_UPLO *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_UPLO) * group_count); - CBLAS_DIAG *unit_nonunit_ref = - (CBLAS_DIAG *)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_DIAG) * group_count); + int* m_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* n_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* lda_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* ldb_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + int* group_size_ref = (int*)oneapi::mkl::aligned_alloc(64, sizeof(int) * group_count); + + CBLAS_TRANSPOSE* trans_ref = + (CBLAS_TRANSPOSE*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_TRANSPOSE) * group_count); + CBLAS_SIDE* left_right_ref = + (CBLAS_SIDE*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_SIDE) * group_count); + CBLAS_UPLO* upper_lower_ref = + (CBLAS_UPLO*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_UPLO) * group_count); + CBLAS_DIAG* unit_nonunit_ref = + (CBLAS_DIAG*)oneapi::mkl::aligned_alloc(64, sizeof(CBLAS_DIAG) * group_count); if ((m_ref == NULL) || (n_ref == NULL) || (lda_ref == NULL) || (ldb_ref == NULL) || (trans_ref == NULL) || (left_right_ref == NULL) || (upper_lower_ref == NULL) || @@ -206,9 +206,9 @@ int 
test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { group_size_ref[i] = (int)group_size[i]; for (j = 0; j < group_size_ref[i]; j++) { ::trsm(convert_to_cblas_layout(layout), left_right_ref[i], upper_lower_ref[i], - trans_ref[i], unit_nonunit_ref[i], (const int *)&m_ref[i], - (const int *)&n_ref[i], (const fp_ref *)&alpha[i], (const fp_ref *)a_array[idx], - (const int *)&lda_ref[i], b_ref_array[idx], (const int *)&ldb_ref[i]); + trans_ref[i], unit_nonunit_ref[i], (const int*)&m_ref[i], (const int*)&n_ref[i], + (const fp_ref*)&alpha[i], (const fp_ref*)a_array[idx], (const int*)&lda_ref[i], + b_ref_array[idx], (const int*)&ldb_ref[i]); idx++; } } @@ -221,13 +221,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { case oneapi::mkl::layout::col_major: done = oneapi::mkl::blas::column_major::trsm_batch( main_queue, &left_right[0], &upper_lower[0], &trans[0], &unit_nonunit[0], &m[0], - &n[0], &alpha[0], (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0], + &n[0], &alpha[0], (const fp**)&a_array[0], &lda[0], &b_array[0], &ldb[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: done = oneapi::mkl::blas::row_major::trsm_batch( main_queue, &left_right[0], &upper_lower[0], &trans[0], &unit_nonunit[0], &m[0], - &n[0], &alpha[0], (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0], + &n[0], &alpha[0], (const fp**)&a_array[0], &lda[0], &b_array[0], &ldb[0], group_count, &group_size[0], dependencies); break; default: break; @@ -239,14 +239,14 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::column_major::trsm_batch, &left_right[0], &upper_lower[0], &trans[0], &unit_nonunit[0], &m[0], &n[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0], + (const fp**)&a_array[0], &lda[0], &b_array[0], &ldb[0], group_count, &group_size[0], dependencies); break; case oneapi::mkl::layout::row_major: 
TEST_RUN_BLAS_CT_SELECT(main_queue, oneapi::mkl::blas::row_major::trsm_batch, &left_right[0], &upper_lower[0], &trans[0], &unit_nonunit[0], &m[0], &n[0], &alpha[0], - (const fp **)&a_array[0], &lda[0], &b_array[0], &ldb[0], + (const fp**)&a_array[0], &lda[0], &b_array[0], &ldb[0], group_count, &group_size[0], dependencies); break; default: break; @@ -254,13 +254,13 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during TRSM_BATCH:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { oneapi::mkl::aligned_free(m_ref); oneapi::mkl::aligned_free(n_ref); oneapi::mkl::aligned_free(lda_ref); @@ -282,7 +282,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of TRSM_BATCH:\n" << error.what() << std::endl; } @@ -319,7 +319,7 @@ int test(device *dev, oneapi::mkl::layout layout, int64_t group_count) { } class TrsmBatchUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(TrsmBatchUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()), 5)); diff --git a/tests/unit_tests/blas/extensions/imatcopy.cpp b/tests/unit_tests/blas/extensions/imatcopy.cpp index e21702775..ba9400817 100644 --- a/tests/unit_tests/blas/extensions/imatcopy.cpp +++ b/tests/unit_tests/blas/extensions/imatcopy.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Prepare data. 
int64_t m, n; int64_t lda, ldb; @@ -95,11 +95,11 @@ int test(device *dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during IMATCOPY:\n" << e.what() << std::endl; print_error_code(e); @@ -138,17 +138,17 @@ int test(device *dev, oneapi::mkl::layout layout) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during IMATCOPY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of IMATCOPY:\n" << error.what() << std::endl; } @@ -162,7 +162,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class ImatcopyTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(ImatcopyTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/extensions/imatcopy_usm.cpp b/tests/unit_tests/blas/extensions/imatcopy_usm.cpp index dc3d43d2e..1acf4ecaf 100644 --- a/tests/unit_tests/blas/extensions/imatcopy_usm.cpp +++ b/tests/unit_tests/blas/extensions/imatcopy_usm.cpp @@ -44,19 +44,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during IMATCOPY:\n" << e.what() << std::endl; print_error_code(e); @@ -145,17 +145,17 @@ int test(device *dev, oneapi::mkl::layout layout) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during IMATCOPY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of IMATCOPY:\n" << error.what() << std::endl; } @@ -167,7 +167,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class ImatcopyUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(ImatcopyUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/extensions/omatadd.cpp b/tests/unit_tests/blas/extensions/omatadd.cpp index b2af98935..7e76f74f9 100644 --- a/tests/unit_tests/blas/extensions/omatadd.cpp +++ b/tests/unit_tests/blas/extensions/omatadd.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Prepare data. int64_t m, n; int64_t lda, ldb, ldc; @@ -106,11 +106,11 @@ int test(device *dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATADD:\n" << e.what() << std::endl; print_error_code(e); @@ -155,16 +155,16 @@ int test(device *dev, oneapi::mkl::layout layout) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATADD:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATADD:\n" << error.what() << std::endl; } @@ -178,7 +178,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class OmataddTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmataddTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/extensions/omatadd_usm.cpp b/tests/unit_tests/blas/extensions/omatadd_usm.cpp index 783f985b2..eff40ae8d 100644 --- a/tests/unit_tests/blas/extensions/omatadd_usm.cpp +++ b/tests/unit_tests/blas/extensions/omatadd_usm.cpp @@ -43,19 +43,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATADD:\n" << e.what() << std::endl; print_error_code(e); @@ -161,16 +161,16 @@ int test(device *dev, oneapi::mkl::layout layout) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATADD:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATADD:\n" << error.what() << std::endl; } @@ -182,7 +182,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class OmataddUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmataddUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/extensions/omatcopy.cpp b/tests/unit_tests/blas/extensions/omatcopy.cpp index 122ba2c79..1ba35d057 100644 --- a/tests/unit_tests/blas/extensions/omatcopy.cpp +++ b/tests/unit_tests/blas/extensions/omatcopy.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Prepare data. int64_t m, n; int64_t lda, ldb; @@ -103,11 +103,11 @@ int test(device *dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY:\n" << e.what() << std::endl; print_error_code(e); @@ -147,17 +147,17 @@ int test(device *dev, oneapi::mkl::layout layout) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATCOPY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATCOPY:\n" << error.what() << std::endl; } @@ -171,7 +171,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class OmatcopyTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmatcopyTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/extensions/omatcopy2.cpp b/tests/unit_tests/blas/extensions/omatcopy2.cpp index d0407c324..3bc7dfccb 100644 --- a/tests/unit_tests/blas/extensions/omatcopy2.cpp +++ b/tests/unit_tests/blas/extensions/omatcopy2.cpp @@ -43,12 +43,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Prepare data. int64_t m, n; int64_t lda, ldb; @@ -100,11 +100,11 @@ int test(device *dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY2:\n" << e.what() << std::endl; print_error_code(e); @@ -146,17 +146,17 @@ int test(device *dev, oneapi::mkl::layout layout) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATCOPY2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATCOPY2:\n" << error.what() << std::endl; } @@ -170,7 +170,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class Omatcopy2Tests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(Omatcopy2Tests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/extensions/omatcopy2_usm.cpp b/tests/unit_tests/blas/extensions/omatcopy2_usm.cpp index d2103d243..3dcf87dc1 100644 --- a/tests/unit_tests/blas/extensions/omatcopy2_usm.cpp +++ b/tests/unit_tests/blas/extensions/omatcopy2_usm.cpp @@ -44,19 +44,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY2:\n" << e.what() << std::endl; print_error_code(e); @@ -157,17 +157,17 @@ int test(device *dev, oneapi::mkl::layout layout) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATCOPY2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATCOPY2:\n" << error.what() << std::endl; } @@ -179,7 +179,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class Omatcopy2UsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(Omatcopy2UsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/extensions/omatcopy_usm.cpp b/tests/unit_tests/blas/extensions/omatcopy_usm.cpp index ac9ba2d5c..b217e2f54 100644 --- a/tests/unit_tests/blas/extensions/omatcopy_usm.cpp +++ b/tests/unit_tests/blas/extensions/omatcopy_usm.cpp @@ -44,19 +44,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during OMATCOPY:\n" << e.what() << std::endl; print_error_code(e); @@ -147,17 +147,17 @@ int test(device *dev, oneapi::mkl::layout layout) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during OMATCOPY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of OMATCOPY:\n" << error.what() << std::endl; } @@ -169,7 +169,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class OmatcopyUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(OmatcopyUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/include/reference_blas_templates.hpp b/tests/unit_tests/blas/include/reference_blas_templates.hpp index 6d184ba75..de7e36d40 100644 --- a/tests/unit_tests/blas/include/reference_blas_templates.hpp +++ b/tests/unit_tests/blas/include/reference_blas_templates.hpp @@ -33,8 +33,8 @@ inline bool isNonTranspose(CBLAS_TRANSPOSE trans) { } template -static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int row, - int col, int ld, T_dest *&dest) { +static inline void copy_mat(T_src& src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int row, + int col, int ld, T_dest*& dest) { int i, j, Iend, Jend; if (layout == CblasColMajor) { Jend = isNonTranspose(trans) ? 
col : row; @@ -53,8 +53,8 @@ static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE tra } template -static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int row, - int col, int ld, T_dest off, T_dest *&dest) { +static inline void copy_mat(T_src& src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int row, + int col, int ld, T_dest off, T_dest*& dest) { int i, j, Iend, Jend; if (layout == CblasColMajor) { Jend = isNonTranspose(trans) ? col : row; @@ -73,8 +73,8 @@ static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, CBLAS_TRANSPOSE tra } template -static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, int row, int col, int ld, - CBLAS_OFFSET off_kind, T_off off, T_dest &dest) { +static inline void copy_mat(T_src& src, CBLAS_LAYOUT layout, int row, int col, int ld, + CBLAS_OFFSET off_kind, T_off off, T_dest& dest) { using T_data = typename std::remove_reference::type; int i, j; T_data tmp; @@ -110,8 +110,8 @@ static inline void copy_mat(T_src &src, CBLAS_LAYOUT layout, int row, int col, i } template -static inline void update_c(T_src &src, CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, int row, - int col, int ld, T_desc *&dest) { +static inline void update_c(T_src& src, CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, int row, + int col, int ld, T_desc*& dest) { int i, j; int Jend = (layout == CblasColMajor) ? 
col : row; @@ -139,15 +139,15 @@ static inline void update_c(T_src &src, CBLAS_LAYOUT layout, CBLAS_UPLO upper_lo /* Level 3 */ template -static void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const fp *alpha, const fp *a, const int *lda, - const fp *b, const int *ldb, const fp *beta, fp *c, const int *ldc); +static void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const fp* alpha, const fp* a, const int* lda, + const fp* b, const int* ldb, const fp* beta, fp* c, const int* ldc); template <> -void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const sycl::half *alpha, const sycl::half *a, const int *lda, - const sycl::half *b, const int *ldb, const sycl::half *beta, sycl::half *c, - const int *ldc) { +void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const sycl::half* alpha, const sycl::half* a, const int* lda, + const sycl::half* b, const int* ldb, const sycl::half* beta, sycl::half* c, + const int* ldc) { // Not supported in NETLIB. SGEMM is used as reference. int sizea, sizeb, sizec; const float alphaf = *alpha; @@ -162,9 +162,9 @@ void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, c sizeb = (transb == CblasNoTrans) ? 
*ldb * *k : *ldb * *n; sizec = *ldc * *m; } - float *af = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea); - float *bf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb); - float *cf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizec); + float* af = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea); + float* bf = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb); + float* cf = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizec); copy_mat(a, layout, transa, *m, *k, *lda, af); copy_mat(b, layout, transb, *k, *n, *ldb, bf); copy_mat(c, layout, CblasNoTrans, *m, *n, *ldc, cf); @@ -177,49 +177,49 @@ void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, c } template <> -void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const float *alpha, const float *a, const int *lda, - const float *b, const int *ldb, const float *beta, float *c, const int *ldc) { +void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const float* alpha, const float* a, const int* lda, + const float* b, const int* ldb, const float* beta, float* c, const int* ldc) { cblas_sgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc); } template <> -void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const double *alpha, const double *a, const int *lda, - const double *b, const int *ldb, const double *beta, double *c, const int *ldc) { +void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const double* alpha, const double* a, const int* lda, + const double* b, const int* ldb, const double* beta, double* c, const int* ldc) { cblas_dgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, a, *lda, b, 
*ldb, *beta, c, *ldc); } template <> -void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const std::complex *alpha, - const std::complex *a, const int *lda, const std::complex *b, - const int *ldb, const std::complex *beta, std::complex *c, const int *ldc) { - cblas_cgemm_wrapper(layout, transa, transb, *m, *n, *k, (const void *)alpha, (const void *)a, - *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const std::complex* alpha, + const std::complex* a, const int* lda, const std::complex* b, + const int* ldb, const std::complex* beta, std::complex* c, const int* ldc) { + cblas_cgemm_wrapper(layout, transa, transb, *m, *n, *k, (const void*)alpha, (const void*)a, + *lda, (const void*)b, *ldb, (const void*)beta, (void*)c, *ldc); } template <> -void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const std::complex *alpha, - const std::complex *a, const int *lda, const std::complex *b, - const int *ldb, const std::complex *beta, std::complex *c, - const int *ldc) { - cblas_zgemm_wrapper(layout, transa, transb, *m, *n, *k, (const void *)alpha, (const void *)a, - *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const std::complex* alpha, + const std::complex* a, const int* lda, const std::complex* b, + const int* ldb, const std::complex* beta, std::complex* c, + const int* ldc) { + cblas_zgemm_wrapper(layout, transa, transb, *m, *n, *k, (const void*)alpha, (const void*)a, + *lda, (const void*)b, *ldb, (const void*)beta, (void*)c, *ldc); } template -static void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - 
const int *n, const int *k, const fpc *alpha, const fpa *a, const int *lda, - const fpa *b, const int *ldb, const fpc *beta, fpc *c, const int *ldc); +static void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const fpc* alpha, const fpa* a, const int* lda, + const fpa* b, const int* ldb, const fpc* beta, fpc* c, const int* ldc); template <> -void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const float *alpha, const sycl::half *a, const int *lda, - const sycl::half *b, const int *ldb, const float *beta, float *c, const int *ldc) { +void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const float* alpha, const sycl::half* a, const int* lda, + const sycl::half* b, const int* ldb, const float* beta, float* c, const int* ldc) { // Not supported in NETLIB. SGEMM is used as reference. int sizea, sizeb; if (layout == CblasColMajor) { @@ -230,8 +230,8 @@ void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, c sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k; sizeb = (transb == CblasNoTrans) ? 
*ldb * *k : *ldb * *n; } - float *af = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea); - float *bf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb); + float* af = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea); + float* bf = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb); copy_mat(a, layout, transa, *m, *k, *lda, af); copy_mat(b, layout, transb, *k, *n, *ldb, bf); cblas_sgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, af, *lda, bf, *ldb, *beta, c, @@ -241,10 +241,10 @@ void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, c } template <> -void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int *m, - const int *n, const int *k, const float *alpha, const oneapi::mkl::bfloat16 *a, - const int *lda, const oneapi::mkl::bfloat16 *b, const int *ldb, const float *beta, - float *c, const int *ldc) { +void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int* m, + const int* n, const int* k, const float* alpha, const oneapi::mkl::bfloat16* a, + const int* lda, const oneapi::mkl::bfloat16* b, const int* ldb, const float* beta, + float* c, const int* ldc) { // Not supported in NETLIB. SGEMM is used as reference. int sizea, sizeb; if (layout == CblasColMajor) { @@ -255,8 +255,8 @@ void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, c sizea = (transa == CblasNoTrans) ? *lda * *m : *lda * *k; sizeb = (transb == CblasNoTrans) ? 
*ldb * *k : *ldb * *n; } - float *af = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea); - float *bf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb); + float* af = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizea); + float* bf = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizeb); copy_mat(a, layout, transa, *m, *k, *lda, af); copy_mat(b, layout, transb, *k, *n, *ldb, bf); cblas_sgemm_wrapper(layout, transa, transb, *m, *n, *k, *alpha, af, *lda, bf, *ldb, *beta, c, @@ -266,1146 +266,1142 @@ void gemm(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, c } template -static void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, - const int *n, const fp *alpha, const fp *a, const int *lda, const fp *b, - const int *ldb, const fp *beta, fp *c, const int *ldc); +static void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int* m, + const int* n, const fp* alpha, const fp* a, const int* lda, const fp* b, + const int* ldb, const fp* beta, fp* c, const int* ldc); template <> -void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n, - const float *alpha, const float *a, const int *lda, const float *b, const int *ldb, - const float *beta, float *c, const int *ldc) { +void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int* m, const int* n, + const float* alpha, const float* a, const int* lda, const float* b, const int* ldb, + const float* beta, float* c, const int* ldc) { cblas_ssymm_wrapper(layout, left_right, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc); } template <> -void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n, - const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, - const double *beta, double *c, const int *ldc) { +void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, 
const int* m, const int* n, + const double* alpha, const double* a, const int* lda, const double* b, const int* ldb, + const double* beta, double* c, const int* ldc) { cblas_dsymm_wrapper(layout, left_right, uplo, *m, *n, *alpha, a, *lda, b, *ldb, *beta, c, *ldc); } template <> -void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const std::complex *beta, - std::complex *c, const int *ldc) { - cblas_csymm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a, - *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int* m, const int* n, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* b, const int* ldb, const std::complex* beta, + std::complex* c, const int* ldc) { + cblas_csymm_wrapper(layout, left_right, uplo, *m, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, (const void*)beta, (void*)c, *ldc); } template <> -void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const std::complex *beta, - std::complex *c, const int *ldc) { - cblas_zsymm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a, - *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void symm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int* m, const int* n, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* b, const int* ldb, const std::complex* beta, + std::complex* c, const int* ldc) { + cblas_zsymm_wrapper(layout, left_right, uplo, *m, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, (const 
void*)beta, (void*)c, *ldc); } template -static void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, - const int *k, const fp *alpha, const fp *a, const int *lda, const fp *beta, fp *c, - const int *ldc); +static void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, + const int* k, const fp* alpha, const fp* a, const int* lda, const fp* beta, fp* c, + const int* ldc); template <> -void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const float *alpha, const float *a, const int *lda, const float *beta, float *c, - const int *ldc) { +void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const float* alpha, const float* a, const int* lda, const float* beta, float* c, + const int* ldc) { cblas_ssyrk_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc); } template <> -void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const double *alpha, const double *a, const int *lda, const double *beta, double *c, - const int *ldc) { +void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const double* alpha, const double* a, const int* lda, const double* beta, double* c, + const int* ldc) { cblas_dsyrk_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, *beta, c, *ldc); } template <> -void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *beta, std::complex *c, const int *ldc) { - cblas_csyrk_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)beta, (void *)c, *ldc); +void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const std::complex* alpha, const std::complex* a, const int* lda, + 
const std::complex* beta, std::complex* c, const int* ldc) { + cblas_csyrk_wrapper(layout, uplo, trans, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)beta, (void*)c, *ldc); } template <> -void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *beta, std::complex *c, const int *ldc) { - cblas_zsyrk_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)beta, (void *)c, *ldc); +void syrk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* beta, std::complex* c, const int* ldc) { + cblas_zsyrk_wrapper(layout, uplo, trans, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)beta, (void*)c, *ldc); } template -static void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, - const int *n, const fp *alpha, const fp *a, const int *lda, const fp *b, - const int *ldb, const fp *beta, fp *c, const int *ldc); +static void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int* m, + const int* n, const fp* alpha, const fp* a, const int* lda, const fp* b, + const int* ldb, const fp* beta, fp* c, const int* ldc); template <> -void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const std::complex *beta, - std::complex *c, const int *ldc) { - cblas_chemm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a, - *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int* m, const int* n, + const std::complex* alpha, const std::complex* a, 
const int* lda, + const std::complex* b, const int* ldb, const std::complex* beta, + std::complex* c, const int* ldc) { + cblas_chemm_wrapper(layout, left_right, uplo, *m, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, (const void*)beta, (void*)c, *ldc); } template <> -void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int *m, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const std::complex *beta, - std::complex *c, const int *ldc) { - cblas_zhemm_wrapper(layout, left_right, uplo, *m, *n, (const void *)alpha, (const void *)a, - *lda, (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void hemm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int* m, const int* n, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* b, const int* ldb, const std::complex* beta, + std::complex* c, const int* ldc) { + cblas_zhemm_wrapper(layout, left_right, uplo, *m, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, (const void*)beta, (void*)c, *ldc); } template -static void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, - const int *k, const fp_scalar *alpha, const fp_data *a, const int *lda, - const fp_scalar *beta, fp_data *c, const int *ldc); +static void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, + const int* k, const fp_scalar* alpha, const fp_data* a, const int* lda, + const fp_scalar* beta, fp_data* c, const int* ldc); template <> -void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const float *alpha, const std::complex *a, const int *lda, const float *beta, - std::complex *c, const int *ldc) { - cblas_cherk_wrapper(layout, uplo, trans, *n, *k, *alpha, (const void *)a, *lda, *beta, - (void *)c, *ldc); +void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, 
CBLAS_TRANSPOSE trans, const int* n, const int* k, + const float* alpha, const std::complex* a, const int* lda, const float* beta, + std::complex* c, const int* ldc) { + cblas_cherk_wrapper(layout, uplo, trans, *n, *k, *alpha, (const void*)a, *lda, *beta, (void*)c, + *ldc); } template <> -void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const double *alpha, const std::complex *a, const int *lda, const double *beta, - std::complex *c, const int *ldc) { - cblas_zherk_wrapper(layout, uplo, trans, *n, *k, *alpha, (const void *)a, *lda, *beta, - (void *)c, *ldc); +void herk(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const double* alpha, const std::complex* a, const int* lda, const double* beta, + std::complex* c, const int* ldc) { + cblas_zherk_wrapper(layout, uplo, trans, *n, *k, *alpha, (const void*)a, *lda, *beta, (void*)c, + *ldc); } template -static void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, - const int *k, const fp *alpha, const fp *a, const int *lda, const fp *b, - const int *ldb, const fp *beta, fp *c, const int *ldc); +static void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, + const int* k, const fp* alpha, const fp* a, const int* lda, const fp* b, + const int* ldb, const fp* beta, fp* c, const int* ldc); template <> -void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const float *alpha, const float *a, const int *lda, const float *b, const int *ldb, - const float *beta, float *c, const int *ldc) { +void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const float* alpha, const float* a, const int* lda, const float* b, const int* ldb, + const float* beta, float* c, const int* ldc) { cblas_ssyr2k_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc); } template <> 
-void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, - const double *beta, double *c, const int *ldc) { +void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const double* alpha, const double* a, const int* lda, const double* b, const int* ldb, + const double* beta, double* c, const int* ldc) { cblas_dsyr2k_wrapper(layout, uplo, trans, *n, *k, *alpha, a, *lda, b, *ldb, *beta, c, *ldc); } template <> -void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const std::complex *beta, - std::complex *c, const int *ldc) { - cblas_csyr2k_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* b, const int* ldb, const std::complex* beta, + std::complex* c, const int* ldc) { + cblas_csyr2k_wrapper(layout, uplo, trans, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, (const void*)beta, (void*)c, *ldc); } template <> -void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const std::complex *beta, - std::complex *c, const int *ldc) { - cblas_zsyr2k_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)b, *ldb, (const void *)beta, (void *)c, *ldc); +void syr2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + 
const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* b, const int* ldb, const std::complex* beta, + std::complex* c, const int* ldc) { + cblas_zsyr2k_wrapper(layout, uplo, trans, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, (const void*)beta, (void*)c, *ldc); } template -static void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, - const int *k, const fp_data *alpha, const fp_data *a, const int *lda, - const fp_data *b, const int *ldb, const fp_scalar *beta, fp_data *c, - const int *ldc); +static void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, + const int* k, const fp_data* alpha, const fp_data* a, const int* lda, + const fp_data* b, const int* ldb, const fp_scalar* beta, fp_data* c, + const int* ldc); template <> -void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const float *beta, std::complex *c, - const int *ldc) { - cblas_cher2k_wrapper(layout, uplo, trans, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)b, *ldb, *beta, (void *)c, *ldc); +void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* b, const int* ldb, const float* beta, std::complex* c, + const int* ldc) { + cblas_cher2k_wrapper(layout, uplo, trans, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, *beta, (void*)c, *ldc); } template <> -void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *b, const int *ldb, const double *beta, - std::complex *c, const int *ldc) { - cblas_zher2k_wrapper(layout, uplo, 
trans, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)b, *ldb, *beta, (void *)c, *ldc); +void her2k(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int* n, const int* k, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* b, const int* ldb, const double* beta, + std::complex* c, const int* ldc) { + cblas_zher2k_wrapper(layout, uplo, trans, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)b, *ldb, *beta, (void*)c, *ldc); } template static void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const fp *alpha, const fp *a, - const int *lda, fp *b, const int *ldb); + CBLAS_DIAG diag, const int* m, const int* n, const fp* alpha, const fp* a, + const int* lda, fp* b, const int* ldb); template <> void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const float *alpha, const float *a, - const int *lda, float *b, const int *ldb) { + CBLAS_DIAG diag, const int* m, const int* n, const float* alpha, const float* a, + const int* lda, float* b, const int* ldb) { cblas_strmm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb); } template <> void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const double *alpha, const double *a, - const int *lda, double *b, const int *ldb) { + CBLAS_DIAG diag, const int* m, const int* n, const double* alpha, const double* a, + const int* lda, double* b, const int* ldb) { cblas_dtrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb); } template <> void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const std::complex *alpha, - const std::complex *a, const int *lda, std::complex 
*b, const int *ldb) { - cblas_ctrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha, - (const void *)a, *lda, (void *)b, *ldb); + CBLAS_DIAG diag, const int* m, const int* n, const std::complex* alpha, + const std::complex* a, const int* lda, std::complex* b, const int* ldb) { + cblas_ctrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void*)alpha, + (const void*)a, *lda, (void*)b, *ldb); } template <> void trmm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const std::complex *alpha, - const std::complex *a, const int *lda, std::complex *b, const int *ldb) { - cblas_ztrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha, - (const void *)a, *lda, (void *)b, *ldb); + CBLAS_DIAG diag, const int* m, const int* n, const std::complex* alpha, + const std::complex* a, const int* lda, std::complex* b, const int* ldb) { + cblas_ztrmm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void*)alpha, + (const void*)a, *lda, (void*)b, *ldb); } template static void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const fp *alpha, const fp *a, - const int *lda, fp *b, const int *ldb); + CBLAS_DIAG diag, const int* m, const int* n, const fp* alpha, const fp* a, + const int* lda, fp* b, const int* ldb); template <> void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const float *alpha, const float *a, - const int *lda, float *b, const int *ldb) { + CBLAS_DIAG diag, const int* m, const int* n, const float* alpha, const float* a, + const int* lda, float* b, const int* ldb) { cblas_strsm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb); } template <> void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, 
const int *m, const int *n, const double *alpha, const double *a, - const int *lda, double *b, const int *ldb) { + CBLAS_DIAG diag, const int* m, const int* n, const double* alpha, const double* a, + const int* lda, double* b, const int* ldb) { cblas_dtrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, *alpha, a, *lda, b, *ldb); } template <> void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const std::complex *alpha, - const std::complex *a, const int *lda, std::complex *b, const int *ldb) { - cblas_ctrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha, - (const void *)a, *lda, (void *)b, *ldb); + CBLAS_DIAG diag, const int* m, const int* n, const std::complex* alpha, + const std::complex* a, const int* lda, std::complex* b, const int* ldb) { + cblas_ctrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void*)alpha, + (const void*)a, *lda, (void*)b, *ldb); } template <> void trsm(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int *m, const int *n, const std::complex *alpha, - const std::complex *a, const int *lda, std::complex *b, const int *ldb) { - cblas_ztrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void *)alpha, - (const void *)a, *lda, (void *)b, *ldb); + CBLAS_DIAG diag, const int* m, const int* n, const std::complex* alpha, + const std::complex* a, const int* lda, std::complex* b, const int* ldb) { + cblas_ztrsm_wrapper(layout, side, uplo, transa, diag, *m, *n, (const void*)alpha, + (const void*)a, *lda, (void*)b, *ldb); } /* Level 2 */ template -static void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, - const fp *alpha, const fp *a, const int *lda, const fp *x, const int *incx, - const fp *beta, fp *y, const int *incy); +static void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, + const fp* alpha, const 
fp* a, const int* lda, const fp* x, const int* incx, + const fp* beta, fp* y, const int* incy); template <> -void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, - const float *alpha, const float *a, const int *lda, const float *x, const int *incx, - const float *beta, float *y, const int *incy) { +void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, + const float* alpha, const float* a, const int* lda, const float* x, const int* incx, + const float* beta, float* y, const int* incy) { cblas_sgemv_wrapper(layout, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template <> -void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, - const double *alpha, const double *a, const int *lda, const double *x, const int *incx, - const double *beta, double *y, const int *incy) { +void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, + const double* alpha, const double* a, const int* lda, const double* x, const int* incx, + const double* beta, double* y, const int* incy) { cblas_dgemv_wrapper(layout, trans, *m, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template <> -void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_cgemv_wrapper(layout, trans, *m, *n, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_cgemv_wrapper(layout, trans, *m, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const 
void*)beta, (void*)y, *incy); } template <> -void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_zgemv_wrapper(layout, trans, *m, *n, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void gemv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_zgemv_wrapper(layout, trans, *m, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const void*)beta, (void*)y, *incy); } template -static void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, - int *ku, const fp *alpha, const fp *a, const int *lda, const fp *x, - const int *incx, const fp *beta, fp *y, const int *incy); +static void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, int* kl, + int* ku, const fp* alpha, const fp* a, const int* lda, const fp* x, + const int* incx, const fp* beta, fp* y, const int* incy); template <> -void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, int *ku, - const float *alpha, const float *a, const int *lda, const float *x, const int *incx, - const float *beta, float *y, const int *incy) { +void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, int* kl, int* ku, + const float* alpha, const float* a, const int* lda, const float* x, const int* incx, + const float* beta, float* y, const int* incy) { cblas_sgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template <> -void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, 
const int *n, int *kl, int *ku, - const double *alpha, const double *a, const int *lda, const double *x, const int *incx, - const double *beta, double *y, const int *incy) { +void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, int* kl, int* ku, + const double* alpha, const double* a, const int* lda, const double* x, const int* incx, + const double* beta, double* y, const int* incy) { cblas_dgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template <> -void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, int *ku, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_cgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, int* kl, int* ku, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_cgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const void*)beta, (void*)y, *incy); } template <> -void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int *m, const int *n, int *kl, int *ku, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_zgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void gbmv(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int* m, const int* n, int* kl, int* ku, + const std::complex* alpha, const 
std::complex* a, const int* lda, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_zgbmv_wrapper(layout, trans, *m, *n, *kl, *ku, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const void*)beta, (void*)y, *incy); } template -static void ger(CBLAS_LAYOUT layout, const int *m, const int *n, const fp *alpha, const fp *x, - const int *incx, const fp *y, const int *incy, fp *a, const int *lda); +static void ger(CBLAS_LAYOUT layout, const int* m, const int* n, const fp* alpha, const fp* x, + const int* incx, const fp* y, const int* incy, fp* a, const int* lda); template <> -void ger(CBLAS_LAYOUT layout, const int *m, const int *n, const float *alpha, const float *x, - const int *incx, const float *y, const int *incy, float *a, const int *lda) { +void ger(CBLAS_LAYOUT layout, const int* m, const int* n, const float* alpha, const float* x, + const int* incx, const float* y, const int* incy, float* a, const int* lda) { cblas_sger_wrapper(layout, *m, *n, *alpha, x, *incx, y, *incy, a, *lda); } template <> -void ger(CBLAS_LAYOUT layout, const int *m, const int *n, const double *alpha, const double *x, - const int *incx, const double *y, const int *incy, double *a, const int *lda) { +void ger(CBLAS_LAYOUT layout, const int* m, const int* n, const double* alpha, const double* x, + const int* incx, const double* y, const int* incy, double* a, const int* lda) { cblas_dger_wrapper(layout, *m, *n, *alpha, x, *incx, y, *incy, a, *lda); } template -static void gerc(CBLAS_LAYOUT layout, const int *m, const int *n, const fp *alpha, const fp *x, - const int *incx, const fp *y, const int *incy, fp *a, const int *lda); +static void gerc(CBLAS_LAYOUT layout, const int* m, const int* n, const fp* alpha, const fp* x, + const int* incx, const fp* y, const int* incy, fp* a, const int* lda); template <> -void gerc(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex *alpha, - const std::complex 
*x, const int *incx, const std::complex *y, - const int *incy, std::complex *a, const int *lda) { - cblas_cgerc_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a, *lda); +void gerc(CBLAS_LAYOUT layout, const int* m, const int* n, const std::complex* alpha, + const std::complex* x, const int* incx, const std::complex* y, + const int* incy, std::complex* a, const int* lda) { + cblas_cgerc_wrapper(layout, *m, *n, (const void*)alpha, (const void*)x, *incx, (const void*)y, + *incy, (void*)a, *lda); } template <> -void gerc(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex *alpha, - const std::complex *x, const int *incx, const std::complex *y, - const int *incy, std::complex *a, const int *lda) { - cblas_zgerc_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a, *lda); +void gerc(CBLAS_LAYOUT layout, const int* m, const int* n, const std::complex* alpha, + const std::complex* x, const int* incx, const std::complex* y, + const int* incy, std::complex* a, const int* lda) { + cblas_zgerc_wrapper(layout, *m, *n, (const void*)alpha, (const void*)x, *incx, (const void*)y, + *incy, (void*)a, *lda); } template -static void geru(CBLAS_LAYOUT layout, const int *m, const int *n, const fp *alpha, const fp *x, - const int *incx, const fp *y, const int *incy, fp *a, const int *lda); +static void geru(CBLAS_LAYOUT layout, const int* m, const int* n, const fp* alpha, const fp* x, + const int* incx, const fp* y, const int* incy, fp* a, const int* lda); template <> -void geru(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex *alpha, - const std::complex *x, const int *incx, const std::complex *y, - const int *incy, std::complex *a, const int *lda) { - cblas_cgeru_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a, *lda); +void geru(CBLAS_LAYOUT layout, const int* m, const int* n, const 
std::complex* alpha, + const std::complex* x, const int* incx, const std::complex* y, + const int* incy, std::complex* a, const int* lda) { + cblas_cgeru_wrapper(layout, *m, *n, (const void*)alpha, (const void*)x, *incx, (const void*)y, + *incy, (void*)a, *lda); } template <> -void geru(CBLAS_LAYOUT layout, const int *m, const int *n, const std::complex *alpha, - const std::complex *x, const int *incx, const std::complex *y, - const int *incy, std::complex *a, const int *lda) { - cblas_zgeru_wrapper(layout, *m, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a, *lda); +void geru(CBLAS_LAYOUT layout, const int* m, const int* n, const std::complex* alpha, + const std::complex* x, const int* incx, const std::complex* y, + const int* incy, std::complex* a, const int* lda) { + cblas_zgeru_wrapper(layout, *m, *n, (const void*)alpha, (const void*)x, *incx, (const void*)y, + *incy, (void*)a, *lda); } template -static void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k, - const fp *alpha, const fp *a, const int *lda, const fp *x, const int *incx, - const fp *beta, fp *y, const int *incy); +static void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const int* k, + const fp* alpha, const fp* a, const int* lda, const fp* x, const int* incx, + const fp* beta, fp* y, const int* incy); template <> -void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_chbmv_wrapper(layout, upper_lower, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const int* k, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* x, const int* incx, 
const std::complex* beta, + std::complex* y, const int* incy) { + cblas_chbmv_wrapper(layout, upper_lower, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const void*)beta, (void*)y, *incy); } template <> -void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_zhbmv_wrapper(layout, upper_lower, *n, *k, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void hbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const int* k, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_zhbmv_wrapper(layout, upper_lower, *n, *k, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const void*)beta, (void*)y, *incy); } template -static void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *a, const int *lda, const fp *x, const int *incx, const fp *beta, fp *y, - const int *incy); +static void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* a, const int* lda, const fp* x, const int* incx, const fp* beta, fp* y, + const int* incy); template <> -void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_chemv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const 
std::complex* a, const int* lda, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_chemv_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const void*)beta, (void*)y, *incy); } template <> -void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *a, const int *lda, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_zhemv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a, *lda, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void hemv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const std::complex* a, const int* lda, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_zhemv_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)a, *lda, + (const void*)x, *incx, (const void*)beta, (void*)y, *incy); } template -static void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp_scalar *alpha, - const fp_data *x, const int *incx, fp_data *a, const int *lda); +static void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp_scalar* alpha, + const fp_data* x, const int* incx, fp_data* a, const int* lda); template <> -void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const std::complex *x, const int *incx, std::complex *a, const int *lda) { - cblas_cher_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a, *lda); +void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const std::complex* x, const int* incx, std::complex* a, const int* lda) { + cblas_cher_wrapper(layout, upper_lower, *n, *alpha, (const void*)x, *incx, (void*)a, *lda); } template <> -void 
her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const std::complex *x, const int *incx, std::complex *a, const int *lda) { - cblas_zher_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a, *lda); +void her(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const std::complex* x, const int* incx, std::complex* a, const int* lda) { + cblas_zher_wrapper(layout, upper_lower, *n, *alpha, (const void*)x, *incx, (void*)a, *lda); } template -static void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *x, const int *incx, const fp *y, const int *incy, fp *a, const int *lda); +static void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* x, const int* incx, const fp* y, const int* incy, fp* a, const int* lda); template <> -void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *x, const int *incx, - const std::complex *y, const int *incy, std::complex *a, const int *lda) { - cblas_cher2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a, *lda); +void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const std::complex* x, const int* incx, + const std::complex* y, const int* incy, std::complex* a, const int* lda) { + cblas_cher2_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)x, *incx, + (const void*)y, *incy, (void*)a, *lda); } template <> -void her2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *x, const int *incx, - const std::complex *y, const int *incy, std::complex *a, const int *lda) { - cblas_zher2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a, *lda); +void her2(CBLAS_LAYOUT layout, 
CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const std::complex* x, const int* incx, + const std::complex* y, const int* incy, std::complex* a, const int* lda) { + cblas_zher2_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)x, *incx, + (const void*)y, *incy, (void*)a, *lda); } template -static void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *a, const fp *x, const int *incx, const fp *beta, fp *y, const int *incy); +static void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* a, const fp* x, const int* incx, const fp* beta, fp* y, const int* incy); template <> -void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *a, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_chpmv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const std::complex* a, + const std::complex* x, const int* incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_chpmv_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)a, (const void*)x, + *incx, (const void*)beta, (void*)y, *incy); } template <> -void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *a, - const std::complex *x, const int *incx, const std::complex *beta, - std::complex *y, const int *incy) { - cblas_zhpmv_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)a, - (const void *)x, *incx, (const void *)beta, (void *)y, *incy); +void hpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const std::complex* a, + const std::complex* x, const int* 
incx, const std::complex* beta, + std::complex* y, const int* incy) { + cblas_zhpmv_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)a, (const void*)x, + *incx, (const void*)beta, (void*)y, *incy); } template -static void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp_scalar *alpha, - const fp_data *x, const int *incx, fp_data *a); +static void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp_scalar* alpha, + const fp_data* x, const int* incx, fp_data* a); template <> -void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const std::complex *x, const int *incx, std::complex *a) { - cblas_chpr_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a); +void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const std::complex* x, const int* incx, std::complex* a) { + cblas_chpr_wrapper(layout, upper_lower, *n, *alpha, (const void*)x, *incx, (void*)a); } template <> -void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const std::complex *x, const int *incx, std::complex *a) { - cblas_zhpr_wrapper(layout, upper_lower, *n, *alpha, (const void *)x, *incx, (void *)a); +void hpr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const std::complex* x, const int* incx, std::complex* a) { + cblas_zhpr_wrapper(layout, upper_lower, *n, *alpha, (const void*)x, *incx, (void*)a); } template -static void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *x, const int *incx, const fp *y, const int *incy, fp *a); +static void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* x, const int* incx, const fp* y, const int* incy, fp* a); template <> -void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *x, const int *incx, 
- const std::complex *y, const int *incy, std::complex *a) { - cblas_chpr2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a); +void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const std::complex* x, const int* incx, + const std::complex* y, const int* incy, std::complex* a) { + cblas_chpr2_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)x, *incx, + (const void*)y, *incy, (void*)a); } template <> -void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, - const std::complex *alpha, const std::complex *x, const int *incx, - const std::complex *y, const int *incy, std::complex *a) { - cblas_zhpr2_wrapper(layout, upper_lower, *n, (const void *)alpha, (const void *)x, *incx, - (const void *)y, *incy, (void *)a); +void hpr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, + const std::complex* alpha, const std::complex* x, const int* incx, + const std::complex* y, const int* incy, std::complex* a) { + cblas_zhpr2_wrapper(layout, upper_lower, *n, (const void*)alpha, (const void*)x, *incx, + (const void*)y, *incy, (void*)a); } template -static void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k, - const fp *alpha, const fp *a, const int *lda, const fp *x, const int *incx, - const fp *beta, fp *y, const int *incy); +static void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const int* k, + const fp* alpha, const fp* a, const int* lda, const fp* x, const int* incx, + const fp* beta, fp* y, const int* incy); template <> -void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k, - const float *alpha, const float *a, const int *lda, const float *x, const int *incx, - const float *beta, float *y, const int *incy) { +void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const int* k, + const float* alpha, const float* a, const int* lda, const 
float* x, const int* incx, + const float* beta, float* y, const int* incy) { cblas_ssbmv_wrapper(layout, upper_lower, *n, *k, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template <> -void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const int *k, - const double *alpha, const double *a, const int *lda, const double *x, const int *incx, - const double *beta, double *y, const int *incy) { +void sbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const int* k, + const double* alpha, const double* a, const int* lda, const double* x, const int* incx, + const double* beta, double* y, const int* incy) { cblas_dsbmv_wrapper(layout, upper_lower, *n, *k, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template -static void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *a, const int *lda, const fp *x, const int *incx, const fp *beta, fp *y, - const int *incy); +static void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* a, const int* lda, const fp* x, const int* incx, const fp* beta, fp* y, + const int* incy); template <> -void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const float *a, const int *lda, const float *x, const int *incx, const float *beta, - float *y, const int *incy) { +void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const float* a, const int* lda, const float* x, const int* incx, const float* beta, + float* y, const int* incy) { cblas_ssymv_wrapper(layout, upper_lower, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template <> -void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const double *a, const int *lda, const double *x, const int *incx, const double *beta, - double *y, const int *incy) { +void symv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const double* a, const 
int* lda, const double* x, const int* incx, const double* beta, + double* y, const int* incy) { cblas_dsymv_wrapper(layout, upper_lower, *n, *alpha, a, *lda, x, *incx, *beta, y, *incy); } template -static void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *x, const int *incx, fp *a, const int *lda); +static void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* x, const int* incx, fp* a, const int* lda); template <> -void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const float *x, const int *incx, float *a, const int *lda) { +void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const float* x, const int* incx, float* a, const int* lda) { cblas_ssyr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a, *lda); } template <> -void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const double *x, const int *incx, double *a, const int *lda) { +void syr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const double* x, const int* incx, double* a, const int* lda) { cblas_dsyr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a, *lda); } template -static void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *x, const int *incx, const fp *y, const int *incy, fp *a, const int *lda); +static void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* x, const int* incx, const fp* y, const int* incy, fp* a, const int* lda); template <> -void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const float *x, const int *incx, const float *y, const int *incy, float *a, - const int *lda) { +void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const float* x, const int* incx, const float* y, const 
int* incy, float* a, + const int* lda) { cblas_ssyr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a, *lda); } template <> -void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const double *x, const int *incx, const double *y, const int *incy, double *a, - const int *lda) { +void syr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const double* x, const int* incx, const double* y, const int* incy, double* a, + const int* lda) { cblas_dsyr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a, *lda); } template -static void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *a, const fp *x, const int *incx, const fp *beta, fp *y, const int *incy); +static void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* a, const fp* x, const int* incx, const fp* beta, fp* y, const int* incy); template <> -void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const float *a, const float *x, const int *incx, const float *beta, float *y, - const int *incy) { +void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const float* a, const float* x, const int* incx, const float* beta, float* y, + const int* incy) { cblas_sspmv_wrapper(layout, upper_lower, *n, *alpha, a, x, *incx, *beta, y, *incy); } template <> -void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const double *a, const double *x, const int *incx, const double *beta, double *y, - const int *incy) { +void spmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const double* a, const double* x, const int* incx, const double* beta, double* y, + const int* incy) { cblas_dspmv_wrapper(layout, upper_lower, *n, *alpha, a, x, *incx, *beta, y, *incy); } template -static void spr(CBLAS_LAYOUT layout, 
CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *x, const int *incx, fp *a); +static void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* x, const int* incx, fp* a); template <> -void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const float *x, const int *incx, float *a) { +void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const float* x, const int* incx, float* a) { cblas_sspr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a); } template <> -void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const double *x, const int *incx, double *a) { +void spr(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const double* x, const int* incx, double* a) { cblas_dspr_wrapper(layout, upper_lower, *n, *alpha, x, *incx, a); } template -static void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const fp *alpha, - const fp *x, const int *incx, const fp *y, const int *incy, fp *a); +static void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const fp* alpha, + const fp* x, const int* incx, const fp* y, const int* incy, fp* a); template <> -void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const float *alpha, - const float *x, const int *incx, const float *y, const int *incy, float *a) { +void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const float* alpha, + const float* x, const int* incx, const float* y, const int* incy, float* a) { cblas_sspr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a); } template <> -void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int *n, const double *alpha, - const double *x, const int *incx, const double *y, const int *incy, double *a) { +void spr2(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int* n, const double* alpha, + const double* 
x, const int* incx, const double* y, const int* incy, double* a) { cblas_dspr2_wrapper(layout, upper_lower, *n, *alpha, x, *incx, y, *incy, a); } template static void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int *n, const int *k, const fp *a, const int *lda, - fp *x, const int *incx); + CBLAS_DIAG unit_diag, const int* n, const int* k, const fp* a, const int* lda, + fp* x, const int* incx); template <> void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const float *a, const int *lda, float *x, const int *incx) { + const int* n, const int* k, const float* a, const int* lda, float* x, const int* incx) { cblas_stbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx); } template <> void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const double *a, const int *lda, double *x, const int *incx) { + const int* n, const int* k, const double* a, const int* lda, double* x, const int* incx) { cblas_dtbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx); } template <> void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const std::complex *a, const int *lda, - std::complex *x, const int *incx) { - cblas_ctbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda, - (void *)x, *incx); + const int* n, const int* k, const std::complex* a, const int* lda, + std::complex* x, const int* incx) { + cblas_ctbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void*)a, *lda, + (void*)x, *incx); } template <> void tbmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const std::complex *a, const int *lda, - std::complex *x, const int *incx) { - 
cblas_ztbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda, - (void *)x, *incx); + const int* n, const int* k, const std::complex* a, const int* lda, + std::complex* x, const int* incx) { + cblas_ztbmv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void*)a, *lda, + (void*)x, *incx); } template static void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int *n, const int *k, const fp *a, const int *lda, - fp *x, const int *incx); + CBLAS_DIAG unit_diag, const int* n, const int* k, const fp* a, const int* lda, + fp* x, const int* incx); template <> void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const float *a, const int *lda, float *x, const int *incx) { + const int* n, const int* k, const float* a, const int* lda, float* x, const int* incx) { cblas_stbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx); } template <> void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const double *a, const int *lda, double *x, const int *incx) { + const int* n, const int* k, const double* a, const int* lda, double* x, const int* incx) { cblas_dtbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, a, *lda, x, *incx); } template <> void tbsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const std::complex *a, const int *lda, - std::complex *x, const int *incx) { - cblas_ctbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda, - (void *)x, *incx); + const int* n, const int* k, const std::complex* a, const int* lda, + std::complex* x, const int* incx) { + cblas_ctbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void*)a, *lda, + (void*)x, *incx); } template <> void tbsv(CBLAS_LAYOUT 
layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const int *k, const std::complex *a, const int *lda, - std::complex *x, const int *incx) { - cblas_ztbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void *)a, *lda, - (void *)x, *incx); + const int* n, const int* k, const std::complex* a, const int* lda, + std::complex* x, const int* incx) { + cblas_ztbsv_wrapper(layout, upper_lower, trans, unit_diag, *n, *k, (const void*)a, *lda, + (void*)x, *incx); } template static void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int *n, const fp *a, fp *x, const int *incx); + CBLAS_DIAG unit_diag, const int* n, const fp* a, fp* x, const int* incx); template <> void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const float *a, float *x, const int *incx) { + const int* n, const float* a, float* x, const int* incx) { cblas_stpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx); } template <> void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const double *a, double *x, const int *incx) { + const int* n, const double* a, double* x, const int* incx) { cblas_dtpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx); } template <> void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const std::complex *a, std::complex *x, const int *incx) { - cblas_ctpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x, - *incx); + const int* n, const std::complex* a, std::complex* x, const int* incx) { + cblas_ctpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, (void*)x, *incx); } template <> void tpmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const 
std::complex *a, std::complex *x, const int *incx) { - cblas_ztpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x, - *incx); + const int* n, const std::complex* a, std::complex* x, const int* incx) { + cblas_ztpmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, (void*)x, *incx); } template static void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int *n, const fp *a, fp *x, const int *incx); + CBLAS_DIAG unit_diag, const int* n, const fp* a, fp* x, const int* incx); template <> void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const float *a, float *x, const int *incx) { + const int* n, const float* a, float* x, const int* incx) { cblas_stpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx); } template <> void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const double *a, double *x, const int *incx) { + const int* n, const double* a, double* x, const int* incx) { cblas_dtpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, x, *incx); } template <> void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const std::complex *a, std::complex *x, const int *incx) { - cblas_ctpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x, - *incx); + const int* n, const std::complex* a, std::complex* x, const int* incx) { + cblas_ctpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, (void*)x, *incx); } template <> void tpsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const std::complex *a, std::complex *x, const int *incx) { - cblas_ztpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, (void *)x, - *incx); + const int* n, const std::complex* 
a, std::complex* x, const int* incx) { + cblas_ztpsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, (void*)x, *incx); } template static void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int *n, const fp *a, const int *lda, fp *x, - const int *incx); + CBLAS_DIAG unit_diag, const int* n, const fp* a, const int* lda, fp* x, + const int* incx); template <> void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const float *a, const int *lda, float *x, const int *incx) { + const int* n, const float* a, const int* lda, float* x, const int* incx) { cblas_strmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx); } template <> void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const double *a, const int *lda, double *x, const int *incx) { + const int* n, const double* a, const int* lda, double* x, const int* incx) { cblas_dtrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx); } template <> void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const std::complex *a, const int *lda, std::complex *x, - const int *incx) { - cblas_ctrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x, + const int* n, const std::complex* a, const int* lda, std::complex* x, + const int* incx) { + cblas_ctrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, *lda, (void*)x, *incx); } template <> void trmv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const std::complex *a, const int *lda, std::complex *x, - const int *incx) { - cblas_ztrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x, + const int* n, const std::complex* a, const int* lda, 
std::complex* x, + const int* incx) { + cblas_ztrmv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, *lda, (void*)x, *incx); } template static void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int *n, const fp *a, const int *lda, fp *x, - const int *incx); + CBLAS_DIAG unit_diag, const int* n, const fp* a, const int* lda, fp* x, + const int* incx); template <> void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const float *a, const int *lda, float *x, const int *incx) { + const int* n, const float* a, const int* lda, float* x, const int* incx) { cblas_strsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx); } template <> void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const double *a, const int *lda, double *x, const int *incx) { + const int* n, const double* a, const int* lda, double* x, const int* incx) { cblas_dtrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, a, *lda, x, *incx); } template <> void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const std::complex *a, const int *lda, std::complex *x, - const int *incx) { - cblas_ctrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x, + const int* n, const std::complex* a, const int* lda, std::complex* x, + const int* incx) { + cblas_ctrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, *lda, (void*)x, *incx); } template <> void trsv(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, - const int *n, const std::complex *a, const int *lda, std::complex *x, - const int *incx) { - cblas_ztrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void *)a, *lda, (void *)x, + const int* n, const std::complex* a, const int* lda, 
std::complex* x, + const int* incx) { + cblas_ztrsv_wrapper(layout, upper_lower, trans, unit_diag, *n, (const void*)a, *lda, (void*)x, *incx); } /* Level 1 */ template -static fp_res asum(const int *n, const fp_data *x, const int *incx); +static fp_res asum(const int* n, const fp_data* x, const int* incx); template <> -float asum(const int *n, const float *x, const int *incx) { +float asum(const int* n, const float* x, const int* incx) { return cblas_sasum_wrapper(*n, x, *incx); } template <> -double asum(const int *n, const double *x, const int *incx) { +double asum(const int* n, const double* x, const int* incx) { return cblas_dasum_wrapper(*n, x, *incx); } template <> -float asum(const int *n, const std::complex *x, const int *incx) { - return cblas_scasum_wrapper(*n, (const void *)x, *incx); +float asum(const int* n, const std::complex* x, const int* incx) { + return cblas_scasum_wrapper(*n, (const void*)x, *incx); } template <> -double asum(const int *n, const std::complex *x, const int *incx) { - return cblas_dzasum_wrapper(*n, (const void *)x, *incx); +double asum(const int* n, const std::complex* x, const int* incx) { + return cblas_dzasum_wrapper(*n, (const void*)x, *incx); } template -static void axpy(const int *n, const fp *alpha, const fp *x, const int *incx, fp *y, - const int *incy); +static void axpy(const int* n, const fp* alpha, const fp* x, const int* incx, fp* y, + const int* incy); template <> -void axpy(const int *n, const float *alpha, const float *x, const int *incx, float *y, - const int *incy) { +void axpy(const int* n, const float* alpha, const float* x, const int* incx, float* y, + const int* incy) { cblas_saxpy_wrapper(*n, *alpha, x, *incx, y, *incy); } template <> -void axpy(const int *n, const double *alpha, const double *x, const int *incx, double *y, - const int *incy) { +void axpy(const int* n, const double* alpha, const double* x, const int* incx, double* y, + const int* incy) { cblas_daxpy_wrapper(*n, *alpha, x, *incx, y, *incy); 
} template <> -void axpy(const int *n, const std::complex *alpha, const std::complex *x, - const int *incx, std::complex *y, const int *incy) { - cblas_caxpy_wrapper(*n, (const void *)alpha, (const void *)x, *incx, (void *)y, *incy); +void axpy(const int* n, const std::complex* alpha, const std::complex* x, + const int* incx, std::complex* y, const int* incy) { + cblas_caxpy_wrapper(*n, (const void*)alpha, (const void*)x, *incx, (void*)y, *incy); } template <> -void axpy(const int *n, const std::complex *alpha, const std::complex *x, - const int *incx, std::complex *y, const int *incy) { - cblas_zaxpy_wrapper(*n, (const void *)alpha, (const void *)x, *incx, (void *)y, *incy); +void axpy(const int* n, const std::complex* alpha, const std::complex* x, + const int* incx, std::complex* y, const int* incy) { + cblas_zaxpy_wrapper(*n, (const void*)alpha, (const void*)x, *incx, (void*)y, *incy); } template -static void copy(const int *n, const fp *x, const int *incx, fp *y, const int *incy); +static void copy(const int* n, const fp* x, const int* incx, fp* y, const int* incy); template <> -void copy(const int *n, const float *x, const int *incx, float *y, const int *incy) { +void copy(const int* n, const float* x, const int* incx, float* y, const int* incy) { cblas_scopy_wrapper(*n, x, *incx, y, *incy); } template <> -void copy(const int *n, const double *x, const int *incx, double *y, const int *incy) { +void copy(const int* n, const double* x, const int* incx, double* y, const int* incy) { cblas_dcopy_wrapper(*n, x, *incx, y, *incy); } template <> -void copy(const int *n, const std::complex *x, const int *incx, std::complex *y, - const int *incy) { - cblas_ccopy_wrapper(*n, (const void *)x, *incx, (void *)y, *incy); +void copy(const int* n, const std::complex* x, const int* incx, std::complex* y, + const int* incy) { + cblas_ccopy_wrapper(*n, (const void*)x, *incx, (void*)y, *incy); } template <> -void copy(const int *n, const std::complex *x, const int *incx, 
std::complex *y, - const int *incy) { - cblas_zcopy_wrapper(*n, (const void *)x, *incx, (void *)y, *incy); +void copy(const int* n, const std::complex* x, const int* incx, std::complex* y, + const int* incy) { + cblas_zcopy_wrapper(*n, (const void*)x, *incx, (void*)y, *incy); } template -static fp_res dot(const int *n, const fp *x, const int *incx, const fp *y, const int *incy); +static fp_res dot(const int* n, const fp* x, const int* incx, const fp* y, const int* incy); template <> -float dot(const int *n, const float *x, const int *incx, const float *y, const int *incy) { +float dot(const int* n, const float* x, const int* incx, const float* y, const int* incy) { return cblas_sdot_wrapper(*n, x, *incx, y, *incy); } template <> -double dot(const int *n, const double *x, const int *incx, const double *y, const int *incy) { +double dot(const int* n, const double* x, const int* incx, const double* y, const int* incy) { return cblas_ddot_wrapper(*n, x, *incx, y, *incy); } template <> -double dot(const int *n, const float *x, const int *incx, const float *y, const int *incy) { +double dot(const int* n, const float* x, const int* incx, const float* y, const int* incy) { return cblas_dsdot_wrapper(*n, x, *incx, y, *incy); } -static float sdsdot(const int *n, const float *sb, const float *x, const int *incx, const float *y, - const int *incy) { +static float sdsdot(const int* n, const float* sb, const float* x, const int* incx, const float* y, + const int* incy) { return cblas_sdsdot_wrapper(*n, *sb, x, *incx, y, *incy); } template -static fp_res nrm2(const int *n, const fp *x, const int *incx); +static fp_res nrm2(const int* n, const fp* x, const int* incx); template <> -float nrm2(const int *n, const float *x, const int *incx) { +float nrm2(const int* n, const float* x, const int* incx) { return cblas_snrm2_wrapper(*n, x, *incx); } template <> -double nrm2(const int *n, const double *x, const int *incx) { +double nrm2(const int* n, const double* x, const int* incx) { 
return cblas_dnrm2_wrapper(*n, x, *incx); } template <> -float nrm2(const int *n, const std::complex *x, const int *incx) { - return cblas_scnrm2_wrapper(*n, (const void *)x, *incx); +float nrm2(const int* n, const std::complex* x, const int* incx) { + return cblas_scnrm2_wrapper(*n, (const void*)x, *incx); } template <> -double nrm2(const int *n, const std::complex *x, const int *incx) { - return cblas_dznrm2_wrapper(*n, (const void *)x, *incx); +double nrm2(const int* n, const std::complex* x, const int* incx) { + return cblas_dznrm2_wrapper(*n, (const void*)x, *incx); } template -static void rot(const int *n, fp *x, const int *incx, fp *y, const int *incy, const fp_scalar *c, - const fp_scalar *s); +static void rot(const int* n, fp* x, const int* incx, fp* y, const int* incy, const fp_scalar* c, + const fp_scalar* s); template <> -void rot(const int *n, float *x, const int *incx, float *y, const int *incy, const float *c, - const float *s) { +void rot(const int* n, float* x, const int* incx, float* y, const int* incy, const float* c, + const float* s) { cblas_srot_wrapper(*n, x, *incx, y, *incy, *c, *s); } template <> -void rot(const int *n, double *x, const int *incx, double *y, const int *incy, const double *c, - const double *s) { +void rot(const int* n, double* x, const int* incx, double* y, const int* incy, const double* c, + const double* s) { cblas_drot_wrapper(*n, x, *incx, y, *incy, *c, *s); } template <> -void rot(const int *n, std::complex *x, const int *incx, std::complex *y, - const int *incy, const float *c, const float *s) { - csrot_wrapper(n, (void *)x, incx, (void *)y, incy, c, s); +void rot(const int* n, std::complex* x, const int* incx, std::complex* y, + const int* incy, const float* c, const float* s) { + csrot_wrapper(n, (void*)x, incx, (void*)y, incy, c, s); } template <> -void rot(const int *n, std::complex *x, const int *incx, std::complex *y, - const int *incy, const double *c, const double *s) { - zdrot_wrapper(n, (void *)x, incx, 
(void *)y, incy, c, s); +void rot(const int* n, std::complex* x, const int* incx, std::complex* y, + const int* incy, const double* c, const double* s) { + zdrot_wrapper(n, (void*)x, incx, (void*)y, incy, c, s); } template -static void rotg(fp *a, fp *b, fp_c *c, fp *s); +static void rotg(fp* a, fp* b, fp_c* c, fp* s); template <> -void rotg(float *a, float *b, float *c, float *s) { +void rotg(float* a, float* b, float* c, float* s) { cblas_srotg_wrapper(a, b, c, s); } template <> -void rotg(double *a, double *b, double *c, double *s) { +void rotg(double* a, double* b, double* c, double* s) { cblas_drotg_wrapper(a, b, c, s); } template <> -void rotg(std::complex *a, std::complex *b, float *c, std::complex *s) { - crotg_wrapper((void *)a, (void *)b, c, (void *)s); +void rotg(std::complex* a, std::complex* b, float* c, std::complex* s) { + crotg_wrapper((void*)a, (void*)b, c, (void*)s); } template <> -void rotg(std::complex *a, std::complex *b, double *c, std::complex *s) { - zrotg_wrapper((void *)a, (void *)b, c, (void *)s); +void rotg(std::complex* a, std::complex* b, double* c, std::complex* s) { + zrotg_wrapper((void*)a, (void*)b, c, (void*)s); } template -static void rotm(const int *n, fp *x, const int *incx, fp *y, const int *incy, const fp *param); +static void rotm(const int* n, fp* x, const int* incx, fp* y, const int* incy, const fp* param); template <> -void rotm(const int *n, float *x, const int *incx, float *y, const int *incy, const float *param) { +void rotm(const int* n, float* x, const int* incx, float* y, const int* incy, const float* param) { cblas_srotm_wrapper(*n, x, *incx, y, *incy, param); } template <> -void rotm(const int *n, double *x, const int *incx, double *y, const int *incy, - const double *param) { +void rotm(const int* n, double* x, const int* incx, double* y, const int* incy, + const double* param) { cblas_drotm_wrapper(*n, x, *incx, y, *incy, param); } template -static void rotmg(fp *d1, fp *d2, fp *x1, fp *y1, fp *param); +static 
void rotmg(fp* d1, fp* d2, fp* x1, fp* y1, fp* param); template <> -void rotmg(float *d1, float *d2, float *x1, float *y1, float *param) { +void rotmg(float* d1, float* d2, float* x1, float* y1, float* param) { cblas_srotmg_wrapper(d1, d2, x1, *y1, param); } template <> -void rotmg(double *d1, double *d2, double *x1, double *y1, double *param) { +void rotmg(double* d1, double* d2, double* x1, double* y1, double* param) { cblas_drotmg_wrapper(d1, d2, x1, *y1, param); } template -static void scal(const int *n, const fp_scalar *alpha, fp_data *x, const int *incx); +static void scal(const int* n, const fp_scalar* alpha, fp_data* x, const int* incx); template <> -void scal(const int *n, const float *alpha, float *x, const int *incx) { +void scal(const int* n, const float* alpha, float* x, const int* incx) { cblas_sscal_wrapper(*n, *alpha, x, *incx); } template <> -void scal(const int *n, const double *alpha, double *x, const int *incx) { +void scal(const int* n, const double* alpha, double* x, const int* incx) { cblas_dscal_wrapper(*n, *alpha, x, *incx); } template <> -void scal(const int *n, const std::complex *alpha, std::complex *x, const int *incx) { - cblas_cscal_wrapper(*n, (const void *)alpha, (void *)x, *incx); +void scal(const int* n, const std::complex* alpha, std::complex* x, const int* incx) { + cblas_cscal_wrapper(*n, (const void*)alpha, (void*)x, *incx); } template <> -void scal(const int *n, const std::complex *alpha, std::complex *x, - const int *incx) { - cblas_zscal_wrapper(*n, (const void *)alpha, (void *)x, *incx); +void scal(const int* n, const std::complex* alpha, std::complex* x, + const int* incx) { + cblas_zscal_wrapper(*n, (const void*)alpha, (void*)x, *incx); } template <> -void scal(const int *n, const float *alpha, std::complex *x, const int *incx) { - cblas_csscal_wrapper(*n, *alpha, (void *)x, *incx); +void scal(const int* n, const float* alpha, std::complex* x, const int* incx) { + cblas_csscal_wrapper(*n, *alpha, (void*)x, *incx); } 
template <> -void scal(const int *n, const double *alpha, std::complex *x, const int *incx) { - cblas_zdscal_wrapper(*n, *alpha, (void *)x, *incx); +void scal(const int* n, const double* alpha, std::complex* x, const int* incx) { + cblas_zdscal_wrapper(*n, *alpha, (void*)x, *incx); } template -static void swap(const int *n, fp *x, const int *incx, fp *y, const int *incy); +static void swap(const int* n, fp* x, const int* incx, fp* y, const int* incy); template <> -void swap(const int *n, float *x, const int *incx, float *y, const int *incy) { +void swap(const int* n, float* x, const int* incx, float* y, const int* incy) { cblas_sswap_wrapper(*n, x, *incx, y, *incy); } template <> -void swap(const int *n, double *x, const int *incx, double *y, const int *incy) { +void swap(const int* n, double* x, const int* incx, double* y, const int* incy) { cblas_dswap_wrapper(*n, x, *incx, y, *incy); } template <> -void swap(const int *n, std::complex *x, const int *incx, std::complex *y, - const int *incy) { - cblas_cswap_wrapper(*n, (void *)x, *incx, (void *)y, *incy); +void swap(const int* n, std::complex* x, const int* incx, std::complex* y, + const int* incy) { + cblas_cswap_wrapper(*n, (void*)x, *incx, (void*)y, *incy); } template <> -void swap(const int *n, std::complex *x, const int *incx, std::complex *y, - const int *incy) { - cblas_zswap_wrapper(*n, (void *)x, *incx, (void *)y, *incy); +void swap(const int* n, std::complex* x, const int* incx, std::complex* y, + const int* incy) { + cblas_zswap_wrapper(*n, (void*)x, *incx, (void*)y, *incy); } template -static void dotc(fp *pres, const int *n, const fp *x, const int *incx, const fp *y, - const int *incy); +static void dotc(fp* pres, const int* n, const fp* x, const int* incx, const fp* y, + const int* incy); template <> -void dotc(std::complex *pres, const int *n, const std::complex *x, const int *incx, - const std::complex *y, const int *incy) { - cblas_cdotc_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, 
*incy, (void *)pres); +void dotc(std::complex* pres, const int* n, const std::complex* x, const int* incx, + const std::complex* y, const int* incy) { + cblas_cdotc_sub_wrapper(*n, (const void*)x, *incx, (const void*)y, *incy, (void*)pres); } template <> -void dotc(std::complex *pres, const int *n, const std::complex *x, const int *incx, - const std::complex *y, const int *incy) { - cblas_zdotc_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, *incy, (void *)pres); +void dotc(std::complex* pres, const int* n, const std::complex* x, const int* incx, + const std::complex* y, const int* incy) { + cblas_zdotc_sub_wrapper(*n, (const void*)x, *incx, (const void*)y, *incy, (void*)pres); } template -static void dotu(fp *pres, const int *n, const fp *x, const int *incx, const fp *y, - const int *incy); +static void dotu(fp* pres, const int* n, const fp* x, const int* incx, const fp* y, + const int* incy); template <> -void dotu(std::complex *pres, const int *n, const std::complex *x, const int *incx, - const std::complex *y, const int *incy) { - cblas_cdotu_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, *incy, (void *)pres); +void dotu(std::complex* pres, const int* n, const std::complex* x, const int* incx, + const std::complex* y, const int* incy) { + cblas_cdotu_sub_wrapper(*n, (const void*)x, *incx, (const void*)y, *incy, (void*)pres); } template <> -void dotu(std::complex *pres, const int *n, const std::complex *x, const int *incx, - const std::complex *y, const int *incy) { - cblas_zdotu_sub_wrapper(*n, (const void *)x, *incx, (const void *)y, *incy, (void *)pres); +void dotu(std::complex* pres, const int* n, const std::complex* x, const int* incx, + const std::complex* y, const int* incy) { + cblas_zdotu_sub_wrapper(*n, (const void*)x, *incx, (const void*)y, *incy, (void*)pres); } template -static int iamax(const int *n, const fp *x, const int *incx); +static int iamax(const int* n, const fp* x, const int* incx); template <> -int iamax(const int *n, 
const float *x, const int *incx) { +int iamax(const int* n, const float* x, const int* incx) { return cblas_isamax_wrapper(*n, x, *incx); } template <> -int iamax(const int *n, const double *x, const int *incx) { +int iamax(const int* n, const double* x, const int* incx) { return cblas_idamax_wrapper(*n, x, *incx); } template <> -int iamax(const int *n, const std::complex *x, const int *incx) { - return cblas_icamax_wrapper(*n, (const void *)x, *incx); +int iamax(const int* n, const std::complex* x, const int* incx) { + return cblas_icamax_wrapper(*n, (const void*)x, *incx); } template <> -int iamax(const int *n, const std::complex *x, const int *incx) { - return cblas_izamax_wrapper(*n, (const void *)x, *incx); +int iamax(const int* n, const std::complex* x, const int* incx) { + return cblas_izamax_wrapper(*n, (const void*)x, *incx); } inline float abs_val(float val) { @@ -1425,10 +1421,10 @@ inline double abs_val(std::complex val) { } template -static int iamin(const int *n, const fp *x, const int *incx); +static int iamin(const int* n, const fp* x, const int* incx); template <> -int iamin(const int *n, const float *x, const int *incx) { +int iamin(const int* n, const float* x, const int* incx) { if (*n < 1 || *incx < 1) { return 0; } @@ -1451,7 +1447,7 @@ int iamin(const int *n, const float *x, const int *incx) { } template <> -int iamin(const int *n, const double *x, const int *incx) { +int iamin(const int* n, const double* x, const int* incx) { if (*n < 1 || *incx < 1) { return 0; } @@ -1474,7 +1470,7 @@ int iamin(const int *n, const double *x, const int *incx) { } template <> -int iamin(const int *n, const std::complex *x, const int *incx) { +int iamin(const int* n, const std::complex* x, const int* incx) { if (*n < 1 || *incx < 1) { return 0; } @@ -1497,7 +1493,7 @@ int iamin(const int *n, const std::complex *x, const int *incx) { } template <> -int iamin(const int *n, const std::complex *x, const int *incx) { +int iamin(const int* n, const std::complex* x, 
const int* incx) { if (*n < 1 || *incx < 1) { return 0; } @@ -1522,12 +1518,12 @@ int iamin(const int *n, const std::complex *x, const int *incx) { /* Extensions */ template -static void axpby(const int *n, const fp *alpha, const fp *x, const int *incx, const fp *beta, - fp *y, const int *incy); +static void axpby(const int* n, const fp* alpha, const fp* x, const int* incx, const fp* beta, + fp* y, const int* incy); template <> -void axpby(const int *n, const float *alpha, const float *x, const int *incx, const float *beta, - float *y, const int *incy) { +void axpby(const int* n, const float* alpha, const float* x, const int* incx, const float* beta, + float* y, const int* incy) { // Not supported in NETLIB. Reference C++ implementation is used. int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx); int idy = (*incy) > 0 ? 0 : (1 - *n) * (*incy); @@ -1536,8 +1532,8 @@ void axpby(const int *n, const float *alpha, const float *x, const int *incx, co } template <> -void axpby(const int *n, const double *alpha, const double *x, const int *incx, const double *beta, - double *y, const int *incy) { +void axpby(const int* n, const double* alpha, const double* x, const int* incx, const double* beta, + double* y, const int* incy) { // Not supported in NETLIB. Reference C++ implementation is used. int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx); int idy = (*incy) > 0 ? 0 : (1 - *n) * (*incy); @@ -1546,9 +1542,9 @@ void axpby(const int *n, const double *alpha, const double *x, const int *incx, } template <> -void axpby(const int *n, const std::complex *alpha, const std::complex *x, - const int *incx, const std::complex *beta, std::complex *y, - const int *incy) { +void axpby(const int* n, const std::complex* alpha, const std::complex* x, + const int* incx, const std::complex* beta, std::complex* y, + const int* incy) { // Not supported in NETLIB. Reference C++ implementation is used. int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx); int idy = (*incy) > 0 ? 
0 : (1 - *n) * (*incy); @@ -1557,9 +1553,9 @@ void axpby(const int *n, const std::complex *alpha, const std::complex -void axpby(const int *n, const std::complex *alpha, const std::complex *x, - const int *incx, const std::complex *beta, std::complex *y, - const int *incy) { +void axpby(const int* n, const std::complex* alpha, const std::complex* x, + const int* incx, const std::complex* beta, std::complex* y, + const int* incy) { // Not supported in NETLIB. Reference C++ implementation is used. int idx = (*incx) > 0 ? 0 : (1 - *n) * (*incx); int idy = (*incy) > 0 ? 0 : (1 - *n) * (*incy); @@ -1569,16 +1565,16 @@ void axpby(const int *n, const std::complex *alpha, const std::complex static void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, - const fps *alpha, const fpa *a, const int *lda, const fpa *ao, const fpb *b, - const int *ldb, const fpb *bo, const fps *beta, fpc *c, const int *ldc, - const fpc *co); + CBLAS_OFFSET offsetc, const int* m, const int* n, const int* k, + const fps* alpha, const fpa* a, const int* lda, const fpa* ao, const fpb* b, + const int* ldb, const fpb* bo, const fps* beta, fpc* c, const int* ldc, + const fpc* co); template <> void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha, - const int8_t *a, const int *lda, const int8_t *ao, const int8_t *b, const int *ldb, - const int8_t *bo, const float *beta, int32_t *c, const int *ldc, const int32_t *co) { + CBLAS_OFFSET offsetc, const int* m, const int* n, const int* k, const float* alpha, + const int8_t* a, const int* lda, const int8_t* ao, const int8_t* b, const int* ldb, + const int8_t* bo, const float* beta, int32_t* c, const int* ldc, const int32_t* co) { // Not supported in NETLIB. DGEMM is used as reference. 
int sizea, sizeb, sizec; if (layout == CblasColMajor) { @@ -1591,9 +1587,9 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran sizeb = (transb == CblasNoTrans) ? *ldb * *k : *ldb * *n; sizec = *ldc * *m; } - double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); - double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); - double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); + double* ad = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); + double* bd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); + double* cd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); double alphad = *alpha; double betad = *beta; double aod = *ao; @@ -1611,10 +1607,10 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran template <> void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha, - const int8_t *a, const int *lda, const int8_t *ao, const uint8_t *b, const int *ldb, - const uint8_t *bo, const float *beta, int32_t *c, const int *ldc, - const int32_t *co) { + CBLAS_OFFSET offsetc, const int* m, const int* n, const int* k, const float* alpha, + const int8_t* a, const int* lda, const int8_t* ao, const uint8_t* b, const int* ldb, + const uint8_t* bo, const float* beta, int32_t* c, const int* ldc, + const int32_t* co) { // Not supported in NETLIB. DGEMM is used as reference. int sizea, sizeb, sizec; if (layout == CblasColMajor) { @@ -1627,9 +1623,9 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran sizeb = (transb == CblasNoTrans) ? 
*ldb * *k : *ldb * *n; sizec = *ldc * *m; } - double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); - double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); - double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); + double* ad = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); + double* bd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); + double* cd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); double alphad = *alpha; double betad = *beta; double aod = *ao; @@ -1647,9 +1643,9 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran template <> void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha, - const uint8_t *a, const int *lda, const uint8_t *ao, const int8_t *b, const int *ldb, - const int8_t *bo, const float *beta, int32_t *c, const int *ldc, const int32_t *co) { + CBLAS_OFFSET offsetc, const int* m, const int* n, const int* k, const float* alpha, + const uint8_t* a, const int* lda, const uint8_t* ao, const int8_t* b, const int* ldb, + const int8_t* bo, const float* beta, int32_t* c, const int* ldc, const int32_t* co) { // Not supported in NETLIB. DGEMM is used as reference. int sizea, sizeb, sizec; if (layout == CblasColMajor) { @@ -1662,9 +1658,9 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran sizeb = (transb == CblasNoTrans) ? 
*ldb * *k : *ldb * *n; sizec = *ldc * *m; } - double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); - double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); - double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); + double* ad = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); + double* bd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); + double* cd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); double alphad = *alpha; double betad = *beta; double aod = *ao; @@ -1682,10 +1678,10 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran template <> void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - CBLAS_OFFSET offsetc, const int *m, const int *n, const int *k, const float *alpha, - const uint8_t *a, const int *lda, const uint8_t *ao, const uint8_t *b, - const int *ldb, const uint8_t *bo, const float *beta, int32_t *c, const int *ldc, - const int32_t *co) { + CBLAS_OFFSET offsetc, const int* m, const int* n, const int* k, const float* alpha, + const uint8_t* a, const int* lda, const uint8_t* ao, const uint8_t* b, + const int* ldb, const uint8_t* bo, const float* beta, int32_t* c, const int* ldc, + const int32_t* co) { // Not supported in NETLIB. DGEMM is used as reference. int sizea, sizeb, sizec; if (layout == CblasColMajor) { @@ -1698,9 +1694,9 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran sizeb = (transb == CblasNoTrans) ? 
*ldb * *k : *ldb * *n; sizec = *ldc * *m; } - double *ad = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); - double *bd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); - double *cd = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); + double* ad = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizea); + double* bd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizeb); + double* cd = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); double alphad = *alpha; double betad = *beta; double aod = *ao; @@ -1718,19 +1714,19 @@ void gemm_bias(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE tran template static void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, - CBLAS_TRANSPOSE transb, const int *n, const int *k, const fp *alpha, const fp *a, - const int *lda, const fp *b, const int *ldb, const fp *beta, fp *c, - const int *ldc); + CBLAS_TRANSPOSE transb, const int* n, const int* k, const fp* alpha, const fp* a, + const int* lda, const fp* b, const int* ldb, const fp* beta, fp* c, + const int* ldc); template <> void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, - CBLAS_TRANSPOSE transb, const int *n, const int *k, const float *alpha, const float *a, - const int *lda, const float *b, const int *ldb, const float *beta, float *c, - const int *ldc) { + CBLAS_TRANSPOSE transb, const int* n, const int* k, const float* alpha, const float* a, + const int* lda, const float* b, const int* ldb, const float* beta, float* c, + const int* ldc) { // Not supported in NETLIB. SGEMM is used as reference. 
int sizec; sizec = *ldc * *n; - float *cf = (float *)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizec); + float* cf = (float*)oneapi::mkl::aligned_alloc(64, sizeof(float) * sizec); update_c(c, layout, upper_lower, *n, *n, *ldc, cf); cblas_sgemm_wrapper(layout, transa, transb, *n, *n, *k, *alpha, a, *lda, b, *ldb, *beta, cf, *ldc); @@ -1740,13 +1736,13 @@ void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, template <> void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, - CBLAS_TRANSPOSE transb, const int *n, const int *k, const double *alpha, const double *a, - const int *lda, const double *b, const int *ldb, const double *beta, double *c, - const int *ldc) { + CBLAS_TRANSPOSE transb, const int* n, const int* k, const double* alpha, const double* a, + const int* lda, const double* b, const int* ldb, const double* beta, double* c, + const int* ldc) { // Not supported in NETLIB. DGEMM is used as reference. int sizec; sizec = *ldc * *n; - double *cf = (double *)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); + double* cf = (double*)oneapi::mkl::aligned_alloc(64, sizeof(double) * sizec); update_c(c, layout, upper_lower, *n, *n, *ldc, cf); cblas_dgemm_wrapper(layout, transa, transb, *n, *n, *k, *alpha, a, *lda, b, *ldb, *beta, cf, *ldc); @@ -1756,15 +1752,15 @@ void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, template <> void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, - CBLAS_TRANSPOSE transb, const int *n, const int *k, const std::complex *alpha, - const std::complex *a, const int *lda, const std::complex *b, - const int *ldb, const std::complex *beta, std::complex *c, - const int *ldc) { + CBLAS_TRANSPOSE transb, const int* n, const int* k, const std::complex* alpha, + const std::complex* a, const int* lda, const std::complex* b, + const int* ldb, const std::complex* beta, std::complex* c, + const int* ldc) { // Not supported in NETLIB. 
CGEMM is used as reference. int sizec; sizec = *ldc * *n; - std::complex *cf = - (std::complex *)oneapi::mkl::aligned_alloc(64, sizeof(std::complex) * sizec); + std::complex* cf = + (std::complex*)oneapi::mkl::aligned_alloc(64, sizeof(std::complex) * sizec); update_c(c, layout, upper_lower, *n, *n, *ldc, cf); cblas_cgemm_wrapper(layout, transa, transb, *n, *n, *k, alpha, a, *lda, b, *ldb, beta, cf, *ldc); @@ -1774,15 +1770,15 @@ void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, template <> void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, - CBLAS_TRANSPOSE transb, const int *n, const int *k, const std::complex *alpha, - const std::complex *a, const int *lda, const std::complex *b, - const int *ldb, const std::complex *beta, std::complex *c, - const int *ldc) { + CBLAS_TRANSPOSE transb, const int* n, const int* k, const std::complex* alpha, + const std::complex* a, const int* lda, const std::complex* b, + const int* ldb, const std::complex* beta, std::complex* c, + const int* ldc) { // Not supported in NETLIB. ZGEMM is used as reference. 
int sizec; sizec = *ldc * *n; - std::complex *cf = (std::complex *)oneapi::mkl::aligned_alloc( - 64, sizeof(std::complex) * sizec); + std::complex* cf = + (std::complex*)oneapi::mkl::aligned_alloc(64, sizeof(std::complex) * sizec); update_c(c, layout, upper_lower, *n, *n, *ldc, cf); cblas_zgemm_wrapper(layout, transa, transb, *n, *n, *k, alpha, a, *lda, b, *ldb, beta, cf, *ldc); @@ -1791,12 +1787,12 @@ void gemmt(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE transa, } template -static void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n, - const fp *a, const int *lda, const fp *x, const int *incx, fp *c, const int *ldc); +static void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int* m, const int* n, + const fp* a, const int* lda, const fp* x, const int* incx, fp* c, const int* ldc); template <> -void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n, const float *a, - const int *lda, const float *x, const int *incx, float *c, const int *ldc) { +void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int* m, const int* n, const float* a, + const int* lda, const float* x, const int* incx, float* c, const int* ldc) { // Not supported in NETLIB. Reference C++ implementation is used. float tmp; int size_x = (left_right == CblasLeft) ? *m : *n; @@ -1827,8 +1823,8 @@ void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n } template <> -void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n, const double *a, - const int *lda, const double *x, const int *incx, double *c, const int *ldc) { +void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int* m, const int* n, const double* a, + const int* lda, const double* x, const int* incx, double* c, const int* ldc) { // Not supported in NETLIB. Reference C++ implementation is used. double tmp; int size_x = (left_right == CblasLeft) ? 
*m : *n; @@ -1859,9 +1855,9 @@ void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n } template <> -void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n, - const std::complex *a, const int *lda, const std::complex *x, - const int *incx, std::complex *c, const int *ldc) { +void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int* m, const int* n, + const std::complex* a, const int* lda, const std::complex* x, + const int* incx, std::complex* c, const int* ldc) { // Not supported in NETLIB. Reference C++ implementation is used. std::complex tmp; int size_x = (left_right == CblasLeft) ? *m : *n; @@ -1912,9 +1908,9 @@ void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n } template <> -void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int *m, const int *n, - const std::complex *a, const int *lda, const std::complex *x, - const int *incx, std::complex *c, const int *ldc) { +void dgmm(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, const int* m, const int* n, + const std::complex* a, const int* lda, const std::complex* x, + const int* incx, std::complex* c, const int* ldc) { // Not supported in NETLIB. Reference C++ implementation is used. std::complex tmp; int size_x = (left_right == CblasLeft) ? 
*m : *n; @@ -1979,7 +1975,7 @@ fp sametype_conj(fp x) { template void omatcopy_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int64_t m, int64_t n, - fp alpha, fp *A, int64_t lda, fp *B, int64_t ldb) { + fp alpha, fp* A, int64_t lda, fp* B, int64_t ldb) { int64_t logical_m, logical_n; if (layout == oneapi::mkl::layout::col_major) { logical_m = m; @@ -2014,9 +2010,9 @@ void omatcopy_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int6 } template -void omatcopy2_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, const int64_t &m, - const int64_t &n, const fp &alpha, const fp *in_matrix, const int64_t &ld_in, - const int64_t &inc_in, fp *out_matrix, const int64_t &ld_out, +void omatcopy2_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, const int64_t& m, + const int64_t& n, const fp& alpha, const fp* in_matrix, const int64_t& ld_in, + const int64_t& inc_in, fp* out_matrix, const int64_t& ld_out, const int64_t inc_out) { int64_t logical_m, logical_n; if (layout == oneapi::mkl::layout::col_major) { @@ -2061,7 +2057,7 @@ void omatcopy2_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, con template void imatcopy_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int64_t m, int64_t n, - fp alpha, fp *A, int64_t lda, int64_t ldb) { + fp alpha, fp* A, int64_t lda, int64_t ldb) { int64_t logical_m, logical_n; if (layout == oneapi::mkl::layout::col_major) { logical_m = m; @@ -2115,8 +2111,8 @@ void imatcopy_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int6 template void omatadd_ref(oneapi::mkl::layout layout, oneapi::mkl::transpose transa, - oneapi::mkl::transpose transb, int64_t m, int64_t n, fp alpha, fp *A, int64_t lda, - fp beta, fp *B, int64_t ldb, fp *C, int64_t ldc) { + oneapi::mkl::transpose transb, int64_t m, int64_t n, fp alpha, fp* A, int64_t lda, + fp beta, fp* B, int64_t ldb, fp* C, int64_t ldc) { int64_t logical_m, logical_n; if (layout == oneapi::mkl::layout::col_major) { 
logical_m = m; diff --git a/tests/unit_tests/blas/include/reference_blas_wrappers.hpp b/tests/unit_tests/blas/include/reference_blas_wrappers.hpp index 8c7d0938a..f08e78557 100644 --- a/tests/unit_tests/blas/include/reference_blas_wrappers.hpp +++ b/tests/unit_tests/blas/include/reference_blas_wrappers.hpp @@ -27,7 +27,7 @@ #ifdef __linux__ #include -#define LIB_TYPE void * +#define LIB_TYPE void* #define GET_LIB_HANDLE(libname) dlopen((libname), RTLD_LAZY | RTLD_LOCAL | RTLD_DEEPBIND) #define GET_FUNC(lib, fn) dlsym(lib, (fn)) #elif defined(_WIN64) @@ -68,129 +68,129 @@ static LIB_TYPE cblas_library() { static void (*cblas_sgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int m, const int n, const int k, const float alpha, - const float *a, const int lda, const float *b, const int ldb, - const float beta, float *c, const int ldc); + const float* a, const int lda, const float* b, const int ldb, + const float beta, float* c, const int ldc); static void (*cblas_dgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int m, const int n, const int k, const double alpha, - const double *a, const int lda, const double *b, const int ldb, - const double beta, double *c, const int ldc); + const double* a, const int lda, const double* b, const int ldb, + const double beta, double* c, const int ldc); static void (*cblas_cgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - const int m, const int n, const int k, const void *alpha, - const void *a, const int lda, const void *b, const int ldb, - const void *beta, void *c, const int ldc); + const int m, const int n, const int k, const void* alpha, + const void* a, const int lda, const void* b, const int ldb, + const void* beta, void* c, const int ldc); static void (*cblas_zgemm_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - const int m, const int n, const int k, const void *alpha, - const void *a, const int lda, 
const void *b, const int ldb, - const void *beta, void *c, const int ldc); + const int m, const int n, const int k, const void* alpha, + const void* a, const int lda, const void* b, const int ldb, + const void* beta, void* c, const int ldc); static void (*cblas_ssymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const float alpha, const float *a, - const int lda, const float *b, const int ldb, const float beta, - float *c, const int ldc); + const int m, const int n, const float alpha, const float* a, + const int lda, const float* b, const int ldb, const float beta, + float* c, const int ldc); static void (*cblas_dsymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const double alpha, const double *a, - const int lda, const double *b, const int ldb, const double beta, - double *c, const int ldc); + const int m, const int n, const double alpha, const double* a, + const int lda, const double* b, const int ldb, const double beta, + double* c, const int ldc); static void (*cblas_csymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, void *c, + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, void* c, const int ldc); static void (*cblas_zsymm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, void *c, + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, void* c, const int ldc); static void (*cblas_ssyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const float *a, - const int lda, const float 
beta, float *c, const int ldc); + const int n, const int k, const float alpha, const float* a, + const int lda, const float beta, float* c, const int ldc); static void (*cblas_dsyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const double *a, - const int lda, const double beta, double *c, const int ldc); + const int n, const int k, const double alpha, const double* a, + const int lda, const double beta, double* c, const int ldc); static void (*cblas_csyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *beta, void *c, const int ldc); + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* beta, void* c, const int ldc); static void (*cblas_zsyrk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *beta, void *c, const int ldc); + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* beta, void* c, const int ldc); static void (*cblas_chemm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, void *c, + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, void* c, const int ldc); static void (*cblas_zhemm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, void *c, + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, void* c, const int ldc); static void (*cblas_cherk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, 
CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const void *a, - const int lda, const float beta, void *c, const int ldc); + const int n, const int k, const float alpha, const void* a, + const int lda, const float beta, void* c, const int ldc); static void (*cblas_zherk_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const void *a, - const int lda, const double beta, void *c, const int ldc); + const int n, const int k, const double alpha, const void* a, + const int lda, const double beta, void* c, const int ldc); static void (*cblas_ssyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const float *a, - const int lda, const float *b, const int ldb, const float beta, - float *c, const int ldc); + const int n, const int k, const float alpha, const float* a, + const int lda, const float* b, const int ldb, const float beta, + float* c, const int ldc); static void (*cblas_dsyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const double *a, - const int lda, const double *b, const int ldb, const double beta, - double *c, const int ldc); + const int n, const int k, const double alpha, const double* a, + const int lda, const double* b, const int ldb, const double beta, + double* c, const int ldc); static void (*cblas_csyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc); + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc); static void (*cblas_zsyr2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const 
void *b, const int ldb, const void *beta, - void *c, const int ldc); + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc); static void (*cblas_cher2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const float beta, - void *c, const int ldc); + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const float beta, + void* c, const int ldc); static void (*cblas_zher2k_p)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const double beta, - void *c, const int ldc); + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const double beta, + void* c, const int ldc); static void (*cblas_strmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const float alpha, const float *a, const int lda, float *b, + const float alpha, const float* a, const int lda, float* b, const int ldb); static void (*cblas_dtrmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const double alpha, const double *a, const int lda, double *b, + const double alpha, const double* a, const int lda, double* b, const int ldb); static void (*cblas_ctrmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb); static void (*cblas_ztrmm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE 
transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb); static void (*cblas_strsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const float alpha, const float *a, const int lda, float *b, + const float alpha, const float* a, const int lda, float* b, const int ldb); static void (*cblas_dtrsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const double alpha, const double *a, const int lda, double *b, + const double alpha, const double* a, const int lda, double* b, const int ldb); static void (*cblas_ctrsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb); static void (*cblas_ztrsm_p)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb); static void cblas_sgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int m, const int n, const int k, const float alpha, - const float *a, const int lda, const float *b, const int ldb, - const float beta, float *c, const int ldc) { + const float* a, const int lda, const float* b, const int ldb, + const float beta, float* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_sgemm_p == NULL) cblas_sgemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - const int m, const int n, const int k, const float alpha, const float *a, - const int lda, const float *b, const 
int ldb, const float beta, float *c, + const int m, const int n, const int k, const float alpha, const float* a, + const int lda, const float* b, const int ldb, const float beta, float* c, const int ldc))GET_FUNC(h_libcblas, "cblas_sgemm"); if (cblas_sgemm_p != NULL) cblas_sgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -199,14 +199,14 @@ static void cblas_sgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBL static void cblas_dgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int m, const int n, const int k, const double alpha, - const double *a, const int lda, const double *b, const int ldb, - const double beta, double *c, const int ldc) { + const double* a, const int lda, const double* b, const int ldb, + const double beta, double* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_dgemm_p == NULL) cblas_dgemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, const int m, const int n, const int k, - const double alpha, const double *a, const int lda, - const double *b, const int ldb, const double beta, double *c, + const double alpha, const double* a, const int lda, + const double* b, const int ldb, const double beta, double* c, const int ldc))GET_FUNC(h_libcblas, "cblas_dgemm"); if (cblas_dgemm_p != NULL) cblas_dgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -214,15 +214,15 @@ static void cblas_dgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBL } static void cblas_cgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - const int m, const int n, const int k, const void *alpha, - const void *a, const int lda, const void *b, const int ldb, - const void *beta, void *c, const int ldc) { + const int m, const int n, const int k, const void* alpha, + const void* a, const int lda, const void* b, const int ldb, + const void* beta, void* c, const int ldc) { if (cblas_library() 
!= NULL) { if (cblas_cgemm_p == NULL) cblas_cgemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - const int m, const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, void *c, + const int m, const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_cgemm"); if (cblas_cgemm_p != NULL) cblas_cgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -230,15 +230,15 @@ static void cblas_cgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBL } static void cblas_zgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - const int m, const int n, const int k, const void *alpha, - const void *a, const int lda, const void *b, const int ldb, - const void *beta, void *c, const int ldc) { + const int m, const int n, const int k, const void* alpha, + const void* a, const int lda, const void* b, const int ldb, + const void* beta, void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_zgemm_p == NULL) cblas_zgemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBLAS_TRANSPOSE transb, - const int m, const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, void *c, + const int m, const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_zgemm"); if (cblas_zgemm_p != NULL) cblas_zgemm_p(layout, transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -246,15 +246,15 @@ static void cblas_zgemm_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE transa, CBL } static void cblas_ssymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const float alpha, 
const float *a, - const int lda, const float *b, const int ldb, const float beta, - float *c, const int ldc) { + const int m, const int n, const float alpha, const float* a, + const int lda, const float* b, const int ldb, const float beta, + float* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_ssymm_p == NULL) cblas_ssymm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int m, - const int n, const float alpha, const float *a, const int lda, - const float *b, const int ldb, const float beta, float *c, + const int n, const float alpha, const float* a, const int lda, + const float* b, const int ldb, const float beta, float* c, const int ldc))GET_FUNC(h_libcblas, "cblas_ssymm"); if (cblas_ssymm_p != NULL) cblas_ssymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc); @@ -262,15 +262,15 @@ static void cblas_ssymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLA } static void cblas_dsymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const double alpha, const double *a, - const int lda, const double *b, const int ldb, const double beta, - double *c, const int ldc) { + const int m, const int n, const double alpha, const double* a, + const int lda, const double* b, const int ldb, const double beta, + double* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_dsymm_p == NULL) cblas_dsymm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, const int m, - const int n, const double alpha, const double *a, const int lda, - const double *b, const int ldb, const double beta, double *c, + const int n, const double alpha, const double* a, const int lda, + const double* b, const int ldb, const double beta, double* c, const int ldc))GET_FUNC(h_libcblas, "cblas_dsymm"); if (cblas_dsymm_p != NULL) cblas_dsymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc); @@ -278,43 +278,43 @@ static void 
cblas_dsymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLA } static void cblas_csymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc) { + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_csymm_p == NULL) cblas_csymm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_csymm"); + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_csymm"); if (cblas_csymm_p != NULL) cblas_csymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } } static void cblas_zsymm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc) { + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_zsymm_p == NULL) cblas_zsymm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_zsymm"); + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int 
ldc))GET_FUNC(h_libcblas, "cblas_zsymm"); if (cblas_zsymm_p != NULL) cblas_zsymm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } } static void cblas_ssyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const float *a, - const int lda, const float beta, float *c, const int ldc) { + const int n, const int k, const float alpha, const float* a, + const int lda, const float beta, float* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_ssyrk_p == NULL) cblas_ssyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const float *a, - const int lda, const float beta, float *c, + const int n, const int k, const float alpha, const float* a, + const int lda, const float beta, float* c, const int ldc))GET_FUNC(h_libcblas, "cblas_ssyrk"); if (cblas_ssyrk_p != NULL) cblas_ssyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc); @@ -322,13 +322,13 @@ static void cblas_ssyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRAN } static void cblas_dsyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const double *a, - const int lda, const double beta, double *c, const int ldc) { + const int n, const int k, const double alpha, const double* a, + const int lda, const double beta, double* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_dsyrk_p == NULL) cblas_dsyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const double *a, - const int lda, const double beta, double *c, + const int n, const int k, const double alpha, const double* a, + const int lda, const double beta, double* c, const int ldc))GET_FUNC(h_libcblas, "cblas_dsyrk"); if (cblas_dsyrk_p != NULL) cblas_dsyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc); @@ -336,13 
+336,13 @@ static void cblas_dsyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRAN } static void cblas_csyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *beta, void *c, const int ldc) { + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* beta, void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_csyrk_p == NULL) cblas_csyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *beta, void *c, + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_csyrk"); if (cblas_csyrk_p != NULL) cblas_csyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc); @@ -350,13 +350,13 @@ static void cblas_csyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRAN } static void cblas_zsyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *beta, void *c, const int ldc) { + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* beta, void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_zsyrk_p == NULL) cblas_zsyrk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *beta, void *c, + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_zsyrk"); if (cblas_zsyrk_p != NULL) cblas_zsyrk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc); @@ -364,43 +364,43 @@ static void cblas_zsyrk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRAN } 
static void cblas_chemm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc) { + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_chemm_p == NULL) cblas_chemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_chemm"); + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_chemm"); if (cblas_chemm_p != NULL) cblas_chemm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } } static void cblas_zhemm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc) { + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_zhemm_p == NULL) cblas_zhemm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE left_right, CBLAS_UPLO uplo, - const int m, const int n, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc))GET_FUNC(h_libcblas, "cblas_zhemm"); + const int m, const int n, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_zhemm"); if (cblas_zhemm_p != NULL) 
cblas_zhemm_p(layout, left_right, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc); } } static void cblas_cherk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const void *a, - const int lda, const float beta, void *c, const int ldc) { + const int n, const int k, const float alpha, const void* a, + const int lda, const float beta, void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_cherk_p == NULL) cblas_cherk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const void *a, - const int lda, const float beta, void *c, + const int n, const int k, const float alpha, const void* a, + const int lda, const float beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_cherk"); if (cblas_cherk_p != NULL) cblas_cherk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc); @@ -408,13 +408,13 @@ static void cblas_cherk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRAN } static void cblas_zherk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const void *a, - const int lda, const double beta, void *c, const int ldc) { + const int n, const int k, const double alpha, const void* a, + const int lda, const double beta, void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_zherk_p == NULL) cblas_zherk_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const void *a, - const int lda, const double beta, void *c, + const int n, const int k, const double alpha, const void* a, + const int lda, const double beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_zherk"); if (cblas_zherk_p != NULL) cblas_zherk_p(layout, uplo, trans, n, k, alpha, a, lda, beta, c, ldc); @@ -422,15 +422,15 @@ static void cblas_zherk_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRAN } 
static void cblas_ssyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const float alpha, const float *a, - const int lda, const float *b, const int ldb, const float beta, - float *c, const int ldc) { + const int n, const int k, const float alpha, const float* a, + const int lda, const float* b, const int ldb, const float beta, + float* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_ssyr2k_p == NULL) cblas_ssyr2k_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n, - const int k, const float alpha, const float *a, const int lda, - const float *b, const int ldb, const float beta, float *c, + const int k, const float alpha, const float* a, const int lda, + const float* b, const int ldb, const float beta, float* c, const int ldc))GET_FUNC(h_libcblas, "cblas_ssyr2k"); if (cblas_ssyr2k_p != NULL) cblas_ssyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -438,15 +438,15 @@ static void cblas_ssyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRA } static void cblas_dsyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const double alpha, const double *a, - const int lda, const double *b, const int ldb, const double beta, - double *c, const int ldc) { + const int n, const int k, const double alpha, const double* a, + const int lda, const double* b, const int ldb, const double beta, + double* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_dsyr2k_p == NULL) cblas_dsyr2k_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n, - const int k, const double alpha, const double *a, const int lda, - const double *b, const int ldb, const double beta, double *c, + const int k, const double alpha, const double* a, const int lda, + const double* b, const int ldb, const double beta, double* c, const int ldc))GET_FUNC(h_libcblas, "cblas_dsyr2k"); if 
(cblas_dsyr2k_p != NULL) cblas_dsyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -454,15 +454,15 @@ static void cblas_dsyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRA } static void cblas_csyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc) { + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_csyr2k_p == NULL) cblas_csyr2k_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n, - const int k, const void *alpha, const void *a, const int lda, - const void *b, const int ldb, const void *beta, void *c, + const int k, const void* alpha, const void* a, const int lda, + const void* b, const int ldb, const void* beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_csyr2k"); if (cblas_csyr2k_p != NULL) cblas_csyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -470,15 +470,15 @@ static void cblas_csyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRA } static void cblas_zsyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const void *beta, - void *c, const int ldc) { + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const void* beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_zsyr2k_p == NULL) cblas_zsyr2k_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n, - const int k, const void *alpha, const void *a, const int lda, - const void *b, const int ldb, const void *beta, void *c, + const int k, const void* 
alpha, const void* a, const int lda, + const void* b, const int ldb, const void* beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_zsyr2k"); if (cblas_zsyr2k_p != NULL) cblas_zsyr2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -486,15 +486,15 @@ static void cblas_zsyr2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRA } static void cblas_cher2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const float beta, - void *c, const int ldc) { + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const float beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_cher2k_p == NULL) cblas_cher2k_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const int n, - const int k, const void *alpha, const void *a, const int lda, - const void *b, const int ldb, const float beta, void *c, + const int k, const void* alpha, const void* a, const int lda, + const void* b, const int ldb, const float beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_cher2k"); if (cblas_cher2k_p != NULL) cblas_cher2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -502,15 +502,15 @@ static void cblas_cher2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRA } static void cblas_zher2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, - const int n, const int k, const void *alpha, const void *a, - const int lda, const void *b, const int ldb, const double beta, - void *c, const int ldc) { + const int n, const int k, const void* alpha, const void* a, + const int lda, const void* b, const int ldb, const double beta, + void* c, const int ldc) { if (cblas_library() != NULL) { if (cblas_zher2k_p == NULL) cblas_zher2k_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, const 
int n, - const int k, const void *alpha, const void *a, const int lda, - const void *b, const int ldb, const double beta, void *c, + const int k, const void* alpha, const void* a, const int lda, + const void* b, const int ldb, const double beta, void* c, const int ldc))GET_FUNC(h_libcblas, "cblas_zher2k"); if (cblas_zher2k_p != NULL) cblas_zher2k_p(layout, uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc); @@ -519,14 +519,14 @@ static void cblas_zher2k_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO uplo, CBLAS_TRA static void cblas_strmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const float alpha, const float *a, const int lda, float *b, + const float alpha, const float* a, const int lda, float* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_strmm_p == NULL) cblas_strmm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, - const int n, const float alpha, const float *a, const int lda, - float *b, const int ldb))GET_FUNC(h_libcblas, "cblas_strmm"); + const int n, const float alpha, const float* a, const int lda, + float* b, const int ldb))GET_FUNC(h_libcblas, "cblas_strmm"); if (cblas_strmm_p != NULL) cblas_strmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -534,14 +534,14 @@ static void cblas_strmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO static void cblas_dtrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const double alpha, const double *a, const int lda, double *b, + const double alpha, const double* a, const int lda, double* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_dtrmm_p == NULL) cblas_dtrmm_p = (void (*)( CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int m, const int n, const double 
alpha, const double *a, - const int lda, double *b, const int ldb))GET_FUNC(h_libcblas, "cblas_dtrmm"); + CBLAS_DIAG diag, const int m, const int n, const double alpha, const double* a, + const int lda, double* b, const int ldb))GET_FUNC(h_libcblas, "cblas_dtrmm"); if (cblas_dtrmm_p != NULL) cblas_dtrmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -549,14 +549,14 @@ static void cblas_dtrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO static void cblas_ctrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_ctrmm_p == NULL) cblas_ctrmm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, - const int n, const void *alpha, const void *a, const int lda, - void *b, const int ldb))GET_FUNC(h_libcblas, "cblas_ctrmm"); + const int n, const void* alpha, const void* a, const int lda, + void* b, const int ldb))GET_FUNC(h_libcblas, "cblas_ctrmm"); if (cblas_ctrmm_p != NULL) cblas_ctrmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -564,14 +564,14 @@ static void cblas_ctrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO static void cblas_ztrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_ztrmm_p == NULL) cblas_ztrmm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, - const int n, const void *alpha, const void *a, const int lda, - void *b, const int 
ldb))GET_FUNC(h_libcblas, "cblas_ztrmm"); + const int n, const void* alpha, const void* a, const int lda, + void* b, const int ldb))GET_FUNC(h_libcblas, "cblas_ztrmm"); if (cblas_ztrmm_p != NULL) cblas_ztrmm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -579,14 +579,14 @@ static void cblas_ztrmm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO static void cblas_strsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const float alpha, const float *a, const int lda, float *b, + const float alpha, const float* a, const int lda, float* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_strsm_p == NULL) cblas_strsm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, - const int n, const float alpha, const float *a, const int lda, - float *b, const int ldb))GET_FUNC(h_libcblas, "cblas_strsm"); + const int n, const float alpha, const float* a, const int lda, + float* b, const int ldb))GET_FUNC(h_libcblas, "cblas_strsm"); if (cblas_strsm_p != NULL) cblas_strsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -594,14 +594,14 @@ static void cblas_strsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO static void cblas_dtrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const double alpha, const double *a, const int lda, double *b, + const double alpha, const double* a, const int lda, double* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_dtrsm_p == NULL) cblas_dtrsm_p = (void (*)( CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, - CBLAS_DIAG diag, const int m, const int n, const double alpha, const double *a, - const int lda, double *b, const int ldb))GET_FUNC(h_libcblas, "cblas_dtrsm"); + CBLAS_DIAG diag, const int m, const 
int n, const double alpha, const double* a, + const int lda, double* b, const int ldb))GET_FUNC(h_libcblas, "cblas_dtrsm"); if (cblas_dtrsm_p != NULL) cblas_dtrsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -609,14 +609,14 @@ static void cblas_dtrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO static void cblas_ctrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_ctrsm_p == NULL) cblas_ctrsm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, - const int n, const void *alpha, const void *a, const int lda, - void *b, const int ldb))GET_FUNC(h_libcblas, "cblas_ctrsm"); + const int n, const void* alpha, const void* a, const int lda, + void* b, const int ldb))GET_FUNC(h_libcblas, "cblas_ctrsm"); if (cblas_ctrsm_p != NULL) cblas_ctrsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -624,14 +624,14 @@ static void cblas_ctrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO static void cblas_ztrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, const int n, - const void *alpha, const void *a, const int lda, void *b, + const void* alpha, const void* a, const int lda, void* b, const int ldb) { if (cblas_library() != NULL) { if (cblas_ztrsm_p == NULL) cblas_ztrsm_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO uplo, CBLAS_TRANSPOSE transa, CBLAS_DIAG diag, const int m, - const int n, const void *alpha, const void *a, const int lda, - void *b, const int ldb))GET_FUNC(h_libcblas, "cblas_ztrsm"); + const int n, const void* alpha, const void* a, const int lda, + void* b, const int 
ldb))GET_FUNC(h_libcblas, "cblas_ztrsm"); if (cblas_ztrsm_p != NULL) cblas_ztrsm_p(layout, side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); } @@ -640,213 +640,213 @@ static void cblas_ztrsm_wrapper(CBLAS_LAYOUT layout, CBLAS_SIDE side, CBLAS_UPLO /* Level 2 */ static void (*cblas_sgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - const float alpha, const float *a, const int lda, const float *x, - const int incx, const float beta, float *y, const int incy); + const float alpha, const float* a, const int lda, const float* x, + const int incx, const float beta, float* y, const int incy); static void (*cblas_dgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - const double alpha, const double *a, const int lda, const double *x, - const int incx, const double beta, double *y, const int incy); + const double alpha, const double* a, const int lda, const double* x, + const int incx, const double beta, double* y, const int incy); static void (*cblas_cgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy); + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy); static void (*cblas_zgemv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy); + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy); static void (*cblas_sgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const float alpha, const float *a, const int lda, - const float *x, const int incx, const float beta, float *y, + int kl, int ku, const float alpha, const float* a, const 
int lda, + const float* x, const int incx, const float beta, float* y, const int incy); static void (*cblas_dgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const double alpha, const double *a, const int lda, - const double *x, const int incx, const double beta, double *y, + int kl, int ku, const double alpha, const double* a, const int lda, + const double* x, const int incx, const double beta, double* y, const int incy); static void (*cblas_cgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + int kl, int ku, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy); static void (*cblas_zgbmv_p)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + int kl, int ku, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy); static void (*cblas_sger_p)(CBLAS_LAYOUT layout, const int m, const int n, const float alpha, - const float *x, const int incx, const float *y, const int incy, - float *a, const int lda); + const float* x, const int incx, const float* y, const int incy, + float* a, const int lda); static void (*cblas_dger_p)(CBLAS_LAYOUT layout, const int m, const int n, const double alpha, - const double *x, const int incx, const double *y, const int incy, - double *a, const int lda); -static void (*cblas_cgerc_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, + const double* x, const int incx, const double* y, const int incy, + double* a, const int lda); +static void (*cblas_cgerc_p)(CBLAS_LAYOUT 
layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda); -static void (*cblas_zgerc_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, +static void (*cblas_zgerc_p)(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda); -static void (*cblas_cgeru_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, +static void (*cblas_cgeru_p)(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda); -static void (*cblas_zgeru_p)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, +static void (*cblas_zgeru_p)(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda); static void (*cblas_chbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy); + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy); static void (*cblas_zhbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy); + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy); static void (*cblas_chemv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int 
n, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy); + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy); static void (*cblas_zhemv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy); + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy); static void (*cblas_cher_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const void *x, const int incx, void *a, + const float alpha, const void* x, const int incx, void* a, const int lda); static void (*cblas_zher_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const void *x, const int incx, void *a, + const double alpha, const void* x, const int incx, void* a, const int lda); static void (*cblas_cher2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a, const int lda); + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a, const int lda); static void (*cblas_zher2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a, const int lda); + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a, const int lda); static void (*cblas_chpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const void *x, const int incx, - const void *beta, void *y, const int incy); + const void* alpha, const void* a, const void* x, const int incx, + const void* beta, void* y, const int incy); static void 
(*cblas_zhpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const void *x, const int incx, - const void *beta, void *y, const int incy); + const void* alpha, const void* a, const void* x, const int incx, + const void* beta, void* y, const int incy); static void (*cblas_chpr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const void *x, const int incx, void *a); + const float alpha, const void* x, const int incx, void* a); static void (*cblas_zhpr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const void *x, const int incx, void *a); + const double alpha, const void* x, const int incx, void* a); static void (*cblas_chpr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a); + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a); static void (*cblas_zhpr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a); + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a); static void (*cblas_ssbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k, - const float alpha, const float *a, const int lda, const float *x, - const int incx, const float beta, float *y, const int incy); + const float alpha, const float* a, const int lda, const float* x, + const int incx, const float beta, float* y, const int incy); static void (*cblas_dsbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k, - const double alpha, const double *a, const int lda, const double *x, - const int incx, const double beta, double *y, const int incy); + const double alpha, const double* a, const int lda, const double* x, + const int incx, const double beta, double* y, const 
int incy); static void (*cblas_ssymv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *a, const int lda, const float *x, - const int incx, const float beta, float *y, const int incy); + const float alpha, const float* a, const int lda, const float* x, + const int incx, const float beta, float* y, const int incy); static void (*cblas_dsymv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *a, const int lda, const double *x, - const int incx, const double beta, double *y, const int incy); + const double alpha, const double* a, const int lda, const double* x, + const int incx, const double beta, double* y, const int incy); static void (*cblas_ssyr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, float *a, + const float alpha, const float* x, const int incx, float* a, const int lda); static void (*cblas_dsyr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, double *a, + const double alpha, const double* x, const int incx, double* a, const int lda); static void (*cblas_ssyr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, const float *y, - const int incy, float *a, const int lda); + const float alpha, const float* x, const int incx, const float* y, + const int incy, float* a, const int lda); static void (*cblas_dsyr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, const double *y, - const int incy, double *a, const int lda); + const double alpha, const double* x, const int incx, const double* y, + const int incy, double* a, const int lda); static void (*cblas_sspmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *a, const float *x, const int incx, - const float beta, float 
*y, const int incy); + const float alpha, const float* a, const float* x, const int incx, + const float beta, float* y, const int incy); static void (*cblas_dspmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *a, const double *x, const int incx, - const double beta, double *y, const int incy); + const double alpha, const double* a, const double* x, const int incx, + const double beta, double* y, const int incy); static void (*cblas_sspr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, float *a); + const float alpha, const float* x, const int incx, float* a); static void (*cblas_dspr_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, double *a); + const double alpha, const double* x, const int incx, double* a); static void (*cblas_sspr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, const float *y, - const int incy, float *a); + const float alpha, const float* x, const int incx, const float* y, + const int incy, float* a); static void (*cblas_dspr2_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, const double *y, - const int incy, double *a); + const double alpha, const double* x, const int incx, const double* y, + const int incy, double* a); static void (*cblas_stbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const float *a, - const int lda, float *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const int k, const float* a, + const int lda, float* x, const int incx); static void (*cblas_dtbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const double *a, - const int lda, double *x, const int incx); 
+ CBLAS_DIAG unit_diag, const int n, const int k, const double* a, + const int lda, double* x, const int incx); static void (*cblas_ctbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int incx); static void (*cblas_ztbmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int incx); static void (*cblas_stbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const float *a, - const int lda, float *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const int k, const float* a, + const int lda, float* x, const int incx); static void (*cblas_dtbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const double *a, - const int lda, double *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const int k, const double* a, + const int lda, double* x, const int incx); static void (*cblas_ctbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int incx); static void (*cblas_ztbsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int 
incx); static void (*cblas_stpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, float *x, + CBLAS_DIAG unit_diag, const int n, const float* a, float* x, const int incx); static void (*cblas_dtpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, double *x, + CBLAS_DIAG unit_diag, const int n, const double* a, double* x, const int incx); static void (*cblas_ctpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx); static void (*cblas_ztpmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx); static void (*cblas_stpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, float *x, + CBLAS_DIAG unit_diag, const int n, const float* a, float* x, const int incx); static void (*cblas_dtpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, double *x, + CBLAS_DIAG unit_diag, const int n, const double* a, double* x, const int incx); static void (*cblas_ctpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx); static void (*cblas_ztpsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx); static void (*cblas_strmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, 
CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, const int lda, - float *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const float* a, const int lda, + float* x, const int incx); static void (*cblas_dtrmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, const int lda, - double *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const double* a, const int lda, + double* x, const int incx); static void (*cblas_ctrmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx); static void (*cblas_ztrmv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx); static void (*cblas_strsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, const int lda, - float *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const float* a, const int lda, + float* x, const int incx); static void (*cblas_dtrsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, const int lda, - double *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const double* a, const int lda, + double* x, const int incx); static void (*cblas_ctrsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx); static void 
(*cblas_ztrsv_p)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx); + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx); static void cblas_sgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, const float alpha, const float *a, const int lda, - const float *x, const int incx, const float beta, float *y, + const int n, const float alpha, const float* a, const int lda, + const float* x, const int incx, const float beta, float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_sgemv_p == NULL) cblas_sgemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, const float alpha, const float *a, const int lda, - const float *x, const int incx, const float beta, float *y, + const int n, const float alpha, const float* a, const int lda, + const float* x, const int incx, const float beta, float* y, const int incy))GET_FUNC(h_libcblas, "cblas_sgemv"); if (cblas_sgemv_p != NULL) cblas_sgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); @@ -854,29 +854,29 @@ static void cblas_sgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, cons } static void cblas_dgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, const double alpha, const double *a, const int lda, - const double *x, const int incx, const double beta, double *y, + const int n, const double alpha, const double* a, const int lda, + const double* x, const int incx, const double beta, double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_dgemv_p == NULL) cblas_dgemv_p = (void (*)( CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - const double alpha, const double *a, const int lda, const double *x, const int incx, - const double beta, double *y, const int incy))GET_FUNC(h_libcblas, "cblas_dgemv"); + const 
double alpha, const double* a, const int lda, const double* x, const int incx, + const double beta, double* y, const int incy))GET_FUNC(h_libcblas, "cblas_dgemv"); if (cblas_dgemv_p != NULL) cblas_dgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); } } static void cblas_cgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int n, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_cgemv_p == NULL) cblas_cgemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int n, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_cgemv"); if (cblas_cgemv_p != NULL) cblas_cgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); @@ -884,14 +884,14 @@ static void cblas_cgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, cons } static void cblas_zgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int n, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_zgemv_p == NULL) cblas_zgemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int n, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const 
void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zgemv"); if (cblas_zgemv_p != NULL) cblas_zgemv_p(layout, trans, m, n, alpha, a, lda, x, incx, beta, y, incy); @@ -899,15 +899,15 @@ static void cblas_zgemv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, cons } static void cblas_sgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, int kl, int ku, const float alpha, const float *a, - const int lda, const float *x, const int incx, const float beta, - float *y, const int incy) { + const int n, int kl, int ku, const float alpha, const float* a, + const int lda, const float* x, const int incx, const float beta, + float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_sgbmv_p == NULL) cblas_sgbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const float alpha, const float *a, const int lda, - const float *x, const int incx, const float beta, float *y, + int kl, int ku, const float alpha, const float* a, const int lda, + const float* x, const int incx, const float beta, float* y, const int incy))GET_FUNC(h_libcblas, "cblas_sgbmv"); if (cblas_sgbmv_p != NULL) cblas_sgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); @@ -915,15 +915,15 @@ static void cblas_sgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, cons } static void cblas_dgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, int kl, int ku, const double alpha, const double *a, - const int lda, const double *x, const int incx, const double beta, - double *y, const int incy) { + const int n, int kl, int ku, const double alpha, const double* a, + const int lda, const double* x, const int incx, const double beta, + double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_dgbmv_p == NULL) cblas_dgbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const double alpha, 
const double *a, const int lda, - const double *x, const int incx, const double beta, double *y, + int kl, int ku, const double alpha, const double* a, const int lda, + const double* x, const int incx, const double beta, double* y, const int incy))GET_FUNC(h_libcblas, "cblas_dgbmv"); if (cblas_dgbmv_p != NULL) cblas_dgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); @@ -931,15 +931,15 @@ static void cblas_dgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, cons } static void cblas_cgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, int kl, int ku, const void *alpha, const void *a, - const int lda, const void *x, const int incx, const void *beta, - void *y, const int incy) { + const int n, int kl, int ku, const void* alpha, const void* a, + const int lda, const void* x, const int incx, const void* beta, + void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_cgbmv_p == NULL) cblas_cgbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + int kl, int ku, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_cgbmv"); if (cblas_cgbmv_p != NULL) cblas_cgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); @@ -947,15 +947,15 @@ static void cblas_cgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, cons } static void cblas_zgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, - const int n, int kl, int ku, const void *alpha, const void *a, - const int lda, const void *x, const int incx, const void *beta, - void *y, const int incy) { + const int n, int kl, int ku, const void* alpha, const void* a, + const int lda, const void* x, const int incx, const void* beta, + void* y, const int incy) { if 
(cblas_library() != NULL) { if (cblas_zgbmv_p == NULL) cblas_zgbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, const int m, const int n, - int kl, int ku, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + int kl, int ku, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zgbmv"); if (cblas_zgbmv_p != NULL) cblas_zgbmv_p(layout, trans, m, n, kl, ku, alpha, a, lda, x, incx, beta, y, incy); @@ -963,13 +963,13 @@ static void cblas_zgbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, cons } static void cblas_sger_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const float alpha, - const float *x, const int incx, const float *y, const int incy, - float *a, const int lda) { + const float* x, const int incx, const float* y, const int incy, + float* a, const int lda) { if (cblas_library() != NULL) { if (cblas_sger_p == NULL) cblas_sger_p = (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const float alpha, - const float *x, const int incx, const float *y, const int incy, float *a, + const float* x, const int incx, const float* y, const int incy, float* a, const int lda))GET_FUNC(h_libcblas, "cblas_sger"); if (cblas_sger_p != NULL) cblas_sger_p(layout, m, n, alpha, x, incx, y, incy, a, lda); @@ -977,69 +977,69 @@ static void cblas_sger_wrapper(CBLAS_LAYOUT layout, const int m, const int n, co } static void cblas_dger_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const double alpha, - const double *x, const int incx, const double *y, const int incy, - double *a, const int lda) { + const double* x, const int incx, const double* y, const int incy, + double* a, const int lda) { if (cblas_library() != NULL) { if (cblas_dger_p == NULL) cblas_dger_p = (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const double alpha, - const double *x, const int incx, const double *y, 
const int incy, - double *a, const int lda))GET_FUNC(h_libcblas, "cblas_dger"); + const double* x, const int incx, const double* y, const int incy, + double* a, const int lda))GET_FUNC(h_libcblas, "cblas_dger"); if (cblas_dger_p != NULL) cblas_dger_p(layout, m, n, alpha, x, incx, y, incy, a, lda); } } -static void cblas_cgerc_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, - void *a, const int lda) { +static void cblas_cgerc_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, + void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_cgerc_p == NULL) cblas_cgerc_p = - (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, + (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda))GET_FUNC(h_libcblas, "cblas_cgerc"); if (cblas_cgerc_p != NULL) cblas_cgerc_p(layout, m, n, alpha, x, incx, y, incy, a, lda); } } -static void cblas_zgerc_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, - void *a, const int lda) { +static void cblas_zgerc_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, + void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_zgerc_p == NULL) cblas_zgerc_p = - (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, + (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda))GET_FUNC(h_libcblas, 
"cblas_zgerc"); if (cblas_zgerc_p != NULL) cblas_zgerc_p(layout, m, n, alpha, x, incx, y, incy, a, lda); } } -static void cblas_cgeru_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, - void *a, const int lda) { +static void cblas_cgeru_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, + void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_cgeru_p == NULL) cblas_cgeru_p = - (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, + (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda))GET_FUNC(h_libcblas, "cblas_cgeru"); if (cblas_cgeru_p != NULL) cblas_cgeru_p(layout, m, n, alpha, x, incx, y, incy, a, lda); } } -static void cblas_zgeru_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, - void *a, const int lda) { +static void cblas_zgeru_wrapper(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, + void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_zgeru_p == NULL) cblas_zgeru_p = - (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void *alpha, - const void *x, const int incx, const void *y, const int incy, void *a, + (void (*)(CBLAS_LAYOUT layout, const int m, const int n, const void* alpha, + const void* x, const int incx, const void* y, const int incy, void* a, const int lda))GET_FUNC(h_libcblas, "cblas_zgeru"); if (cblas_zgeru_p != NULL) cblas_zgeru_p(layout, m, n, alpha, x, incx, y, incy, a, lda); @@ -1047,14 +1047,14 @@ static void cblas_zgeru_wrapper(CBLAS_LAYOUT layout, const int 
m, const int n, c } static void cblas_chbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const int k, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int k, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_chbmv_p == NULL) cblas_chbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const int k, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int k, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_chbmv"); if (cblas_chbmv_p != NULL) cblas_chbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); @@ -1062,14 +1062,14 @@ static void cblas_chbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_zhbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const int k, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int k, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_zhbmv_p == NULL) cblas_zhbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const int k, const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const int k, const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zhbmv"); if (cblas_zhbmv_p != NULL) cblas_zhbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); @@ -1077,13 +1077,13 @@ static void 
cblas_zhbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_chemv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy) { + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_chemv_p == NULL) cblas_chemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_chemv"); if (cblas_chemv_p != NULL) cblas_chemv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); @@ -1091,13 +1091,13 @@ static void cblas_chemv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_zhemv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const int lda, const void *x, - const int incx, const void *beta, void *y, const int incy) { + const void* alpha, const void* a, const int lda, const void* x, + const int incx, const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_zhemv_p == NULL) cblas_zhemv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const int lda, - const void *x, const int incx, const void *beta, void *y, + const void* alpha, const void* a, const int lda, + const void* x, const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zhemv"); if (cblas_zhemv_p != NULL) cblas_zhemv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); @@ -1105,12 +1105,12 @@ static void cblas_zhemv_wrapper(CBLAS_LAYOUT layout, 
CBLAS_UPLO upper_lower, con } static void cblas_cher_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const void *x, const int incx, void *a, + const float alpha, const void* x, const int incx, void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_cher_p == NULL) cblas_cher_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const void *x, const int incx, void *a, + const float alpha, const void* x, const int incx, void* a, const int lda))GET_FUNC(h_libcblas, "cblas_cher"); if (cblas_cher_p != NULL) cblas_cher_p(layout, upper_lower, n, alpha, x, incx, a, lda); @@ -1118,12 +1118,12 @@ static void cblas_cher_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, cons } static void cblas_zher_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const void *x, const int incx, void *a, + const double alpha, const void* x, const int incx, void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_zher_p == NULL) cblas_zher_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const void *x, const int incx, void *a, + const double alpha, const void* x, const int incx, void* a, const int lda))GET_FUNC(h_libcblas, "cblas_zher"); if (cblas_zher_p != NULL) cblas_zher_p(layout, upper_lower, n, alpha, x, incx, a, lda); @@ -1131,13 +1131,13 @@ static void cblas_zher_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, cons } static void cblas_cher2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a, const int lda) { + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_cher2_p == NULL) cblas_cher2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, 
const int incx, - const void *y, const int incy, void *a, + const void* alpha, const void* x, const int incx, + const void* y, const int incy, void* a, const int lda))GET_FUNC(h_libcblas, "cblas_cher2"); if (cblas_cher2_p != NULL) cblas_cher2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda); @@ -1145,13 +1145,13 @@ static void cblas_cher2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_zher2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a, const int lda) { + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a, const int lda) { if (cblas_library() != NULL) { if (cblas_zher2_p == NULL) cblas_zher2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, - const void *y, const int incy, void *a, + const void* alpha, const void* x, const int incx, + const void* y, const int incy, void* a, const int lda))GET_FUNC(h_libcblas, "cblas_zher2"); if (cblas_zher2_p != NULL) cblas_zher2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda); @@ -1159,13 +1159,13 @@ static void cblas_zher2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_chpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const void *x, const int incx, - const void *beta, void *y, const int incy) { + const void* alpha, const void* a, const void* x, const int incx, + const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_chpmv_p == NULL) cblas_chpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const void *x, - const int incx, const void *beta, void *y, + const void* alpha, const void* a, const void* x, + const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, 
"cblas_chpmv"); if (cblas_chpmv_p != NULL) cblas_chpmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy); @@ -1173,13 +1173,13 @@ static void cblas_chpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_zhpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const void *x, const int incx, - const void *beta, void *y, const int incy) { + const void* alpha, const void* a, const void* x, const int incx, + const void* beta, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_zhpmv_p == NULL) cblas_zhpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *a, const void *x, - const int incx, const void *beta, void *y, + const void* alpha, const void* a, const void* x, + const int incx, const void* beta, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zhpmv"); if (cblas_zhpmv_p != NULL) cblas_zhpmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy); @@ -1187,66 +1187,66 @@ static void cblas_zhpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_chpr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const void *x, const int incx, void *a) { + const float alpha, const void* x, const int incx, void* a) { if (cblas_library() != NULL) { if (cblas_chpr_p == NULL) cblas_chpr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const void *x, const int incx, - void *a))GET_FUNC(h_libcblas, "cblas_chpr"); + const float alpha, const void* x, const int incx, + void* a))GET_FUNC(h_libcblas, "cblas_chpr"); if (cblas_chpr_p != NULL) cblas_chpr_p(layout, upper_lower, n, alpha, x, incx, a); } } static void cblas_zhpr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const void *x, const int incx, void *a) { + const double alpha, const void* x, const int incx, void* a) { if 
(cblas_library() != NULL) { if (cblas_zhpr_p == NULL) cblas_zhpr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const void *x, const int incx, - void *a))GET_FUNC(h_libcblas, "cblas_zhpr"); + const double alpha, const void* x, const int incx, + void* a))GET_FUNC(h_libcblas, "cblas_zhpr"); if (cblas_zhpr_p != NULL) cblas_zhpr_p(layout, upper_lower, n, alpha, x, incx, a); } } static void cblas_chpr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a) { + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a) { if (cblas_library() != NULL) { if (cblas_chpr2_p == NULL) cblas_chpr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a))GET_FUNC(h_libcblas, "cblas_chpr2"); + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a))GET_FUNC(h_libcblas, "cblas_chpr2"); if (cblas_chpr2_p != NULL) cblas_chpr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a); } } static void cblas_zhpr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a) { + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a) { if (cblas_library() != NULL) { if (cblas_zhpr2_p == NULL) cblas_zhpr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const void *alpha, const void *x, const int incx, const void *y, - const int incy, void *a))GET_FUNC(h_libcblas, "cblas_zhpr2"); + const void* alpha, const void* x, const int incx, const void* y, + const int incy, void* a))GET_FUNC(h_libcblas, "cblas_zhpr2"); if (cblas_zhpr2_p != NULL) cblas_zhpr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a); } } static 
void cblas_ssbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const int k, const float alpha, const float *a, const int lda, - const float *x, const int incx, const float beta, float *y, + const int k, const float alpha, const float* a, const int lda, + const float* x, const int incx, const float beta, float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_ssbmv_p == NULL) cblas_ssbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const int k, const float alpha, const float *a, const int lda, - const float *x, const int incx, const float beta, float *y, + const int k, const float alpha, const float* a, const int lda, + const float* x, const int incx, const float beta, float* y, const int incy))GET_FUNC(h_libcblas, "cblas_ssbmv"); if (cblas_ssbmv_p != NULL) cblas_ssbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); @@ -1254,28 +1254,28 @@ static void cblas_ssbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_dsbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const int k, const double alpha, const double *a, const int lda, - const double *x, const int incx, const double beta, double *y, + const int k, const double alpha, const double* a, const int lda, + const double* x, const int incx, const double beta, double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_dsbmv_p == NULL) cblas_dsbmv_p = (void (*)( CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, const int k, - const double alpha, const double *a, const int lda, const double *x, const int incx, - const double beta, double *y, const int incy))GET_FUNC(h_libcblas, "cblas_dsbmv"); + const double alpha, const double* a, const int lda, const double* x, const int incx, + const double beta, double* y, const int incy))GET_FUNC(h_libcblas, "cblas_dsbmv"); if (cblas_dsbmv_p != NULL) cblas_dsbmv_p(layout, upper_lower, n, k, alpha, a, lda, x, incx, beta, y, incy); 
} } static void cblas_ssymv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *a, const int lda, const float *x, - const int incx, const float beta, float *y, const int incy) { + const float alpha, const float* a, const int lda, const float* x, + const int incx, const float beta, float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_ssymv_p == NULL) cblas_ssymv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *a, const int lda, - const float *x, const int incx, const float beta, float *y, + const float alpha, const float* a, const int lda, + const float* x, const int incx, const float beta, float* y, const int incy))GET_FUNC(h_libcblas, "cblas_ssymv"); if (cblas_ssymv_p != NULL) cblas_ssymv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); @@ -1283,13 +1283,13 @@ static void cblas_ssymv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_dsymv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *a, const int lda, const double *x, - const int incx, const double beta, double *y, const int incy) { + const double alpha, const double* a, const int lda, const double* x, + const int incx, const double beta, double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_dsymv_p == NULL) cblas_dsymv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *a, const int lda, - const double *x, const int incx, const double beta, double *y, + const double alpha, const double* a, const int lda, + const double* x, const int incx, const double beta, double* y, const int incy))GET_FUNC(h_libcblas, "cblas_dsymv"); if (cblas_dsymv_p != NULL) cblas_dsymv_p(layout, upper_lower, n, alpha, a, lda, x, incx, beta, y, incy); @@ -1297,12 +1297,12 @@ static void cblas_dsymv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con 
} static void cblas_ssyr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, float *a, + const float alpha, const float* x, const int incx, float* a, const int lda) { if (cblas_library() != NULL) { if (cblas_ssyr_p == NULL) cblas_ssyr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, float *a, + const float alpha, const float* x, const int incx, float* a, const int lda))GET_FUNC(h_libcblas, "cblas_ssyr"); if (cblas_ssyr_p != NULL) cblas_ssyr_p(layout, upper_lower, n, alpha, x, incx, a, lda); @@ -1310,12 +1310,12 @@ static void cblas_ssyr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, cons } static void cblas_dsyr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, double *a, + const double alpha, const double* x, const int incx, double* a, const int lda) { if (cblas_library() != NULL) { if (cblas_dsyr_p == NULL) cblas_dsyr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, double *a, + const double alpha, const double* x, const int incx, double* a, const int lda))GET_FUNC(h_libcblas, "cblas_dsyr"); if (cblas_dsyr_p != NULL) cblas_dsyr_p(layout, upper_lower, n, alpha, x, incx, a, lda); @@ -1323,13 +1323,13 @@ static void cblas_dsyr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, cons } static void cblas_ssyr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, const float *y, - const int incy, float *a, const int lda) { + const float alpha, const float* x, const int incx, const float* y, + const int incy, float* a, const int lda) { if (cblas_library() != NULL) { if (cblas_ssyr2_p == NULL) cblas_ssyr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, 
const int incx, - const float *y, const int incy, float *a, + const float alpha, const float* x, const int incx, + const float* y, const int incy, float* a, const int lda))GET_FUNC(h_libcblas, "cblas_ssyr2"); if (cblas_ssyr2_p != NULL) cblas_ssyr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda); @@ -1337,13 +1337,13 @@ static void cblas_ssyr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_dsyr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, - const double *y, const int incy, double *a, const int lda) { + const double alpha, const double* x, const int incx, + const double* y, const int incy, double* a, const int lda) { if (cblas_library() != NULL) { if (cblas_dsyr2_p == NULL) cblas_dsyr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, - const double *y, const int incy, double *a, + const double alpha, const double* x, const int incx, + const double* y, const int incy, double* a, const int lda))GET_FUNC(h_libcblas, "cblas_dsyr2"); if (cblas_dsyr2_p != NULL) cblas_dsyr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a, lda); @@ -1351,13 +1351,13 @@ static void cblas_dsyr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_sspmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *a, const float *x, const int incx, - const float beta, float *y, const int incy) { + const float alpha, const float* a, const float* x, const int incx, + const float beta, float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_sspmv_p == NULL) cblas_sspmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *a, const float *x, - const int incx, const float beta, float *y, + const float alpha, const float* a, const float* x, + const int incx, const float beta, 
float* y, const int incy))GET_FUNC(h_libcblas, "cblas_sspmv"); if (cblas_sspmv_p != NULL) cblas_sspmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy); @@ -1365,13 +1365,13 @@ static void cblas_sspmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_dspmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *a, const double *x, - const int incx, const double beta, double *y, const int incy) { + const double alpha, const double* a, const double* x, + const int incx, const double beta, double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_dspmv_p == NULL) cblas_dspmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *a, const double *x, - const int incx, const double beta, double *y, + const double alpha, const double* a, const double* x, + const int incx, const double beta, double* y, const int incy))GET_FUNC(h_libcblas, "cblas_dspmv"); if (cblas_dspmv_p != NULL) cblas_dspmv_p(layout, upper_lower, n, alpha, a, x, incx, beta, y, incy); @@ -1379,65 +1379,65 @@ static void cblas_dspmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, con } static void cblas_sspr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, float *a) { + const float alpha, const float* x, const int incx, float* a) { if (cblas_library() != NULL) { if (cblas_sspr_p == NULL) cblas_sspr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, - float *a))GET_FUNC(h_libcblas, "cblas_sspr"); + const float alpha, const float* x, const int incx, + float* a))GET_FUNC(h_libcblas, "cblas_sspr"); if (cblas_sspr_p != NULL) cblas_sspr_p(layout, upper_lower, n, alpha, x, incx, a); } } static void cblas_dspr_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int 
incx, double *a) { + const double alpha, const double* x, const int incx, double* a) { if (cblas_library() != NULL) { if (cblas_dspr_p == NULL) cblas_dspr_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, - double *a))GET_FUNC(h_libcblas, "cblas_dspr"); + const double alpha, const double* x, const int incx, + double* a))GET_FUNC(h_libcblas, "cblas_dspr"); if (cblas_dspr_p != NULL) cblas_dspr_p(layout, upper_lower, n, alpha, x, incx, a); } } static void cblas_sspr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, const float *y, - const int incy, float *a) { + const float alpha, const float* x, const int incx, const float* y, + const int incy, float* a) { if (cblas_library() != NULL) { if (cblas_sspr2_p == NULL) cblas_sspr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const float alpha, const float *x, const int incx, const float *y, - const int incy, float *a))GET_FUNC(h_libcblas, "cblas_sspr2"); + const float alpha, const float* x, const int incx, const float* y, + const int incy, float* a))GET_FUNC(h_libcblas, "cblas_sspr2"); if (cblas_sspr2_p != NULL) cblas_sspr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a); } } static void cblas_dspr2_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, - const double *y, const int incy, double *a) { + const double alpha, const double* x, const int incx, + const double* y, const int incy, double* a) { if (cblas_library() != NULL) { if (cblas_dspr2_p == NULL) cblas_dspr2_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, const int n, - const double alpha, const double *x, const int incx, const double *y, - const int incy, double *a))GET_FUNC(h_libcblas, "cblas_dspr2"); + const double alpha, const double* x, const int incx, const double* y, + const int incy, double* 
a))GET_FUNC(h_libcblas, "cblas_dspr2"); if (cblas_dspr2_p != NULL) cblas_dspr2_p(layout, upper_lower, n, alpha, x, incx, y, incy, a); } } static void cblas_stbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const float *a, - const int lda, float *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const float* a, + const int lda, float* x, const int incx) { if (cblas_library() != NULL) { if (cblas_stbmv_p == NULL) cblas_stbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const float *a, const int lda, float *x, + const int k, const float* a, const int lda, float* x, const int incx))GET_FUNC(h_libcblas, "cblas_stbmv"); if (cblas_stbmv_p != NULL) cblas_stbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1445,13 +1445,13 @@ static void cblas_stbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_dtbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const double *a, - const int lda, double *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const double* a, + const int lda, double* x, const int incx) { if (cblas_library() != NULL) { if (cblas_dtbmv_p == NULL) cblas_dtbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const double *a, const int lda, double *x, + const int k, const double* a, const int lda, double* x, const int incx))GET_FUNC(h_libcblas, "cblas_dtbmv"); if (cblas_dtbmv_p != NULL) cblas_dtbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1459,13 +1459,13 @@ static void cblas_dtbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ctbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, 
CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ctbmv_p == NULL) cblas_ctbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const void *a, const int lda, void *x, + const int k, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ctbmv"); if (cblas_ctbmv_p != NULL) cblas_ctbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1473,13 +1473,13 @@ static void cblas_ctbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ztbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ztbmv_p == NULL) cblas_ztbmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const void *a, const int lda, void *x, + const int k, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ztbmv"); if (cblas_ztbmv_p != NULL) cblas_ztbmv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1487,13 +1487,13 @@ static void cblas_ztbmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_stbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const float *a, - const int lda, float *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const float* a, + const int lda, float* x, const int incx) { 
if (cblas_library() != NULL) { if (cblas_stbsv_p == NULL) cblas_stbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const float *a, const int lda, float *x, + const int k, const float* a, const int lda, float* x, const int incx))GET_FUNC(h_libcblas, "cblas_stbsv"); if (cblas_stbsv_p != NULL) cblas_stbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1501,13 +1501,13 @@ static void cblas_stbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_dtbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const double *a, - const int lda, double *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const double* a, + const int lda, double* x, const int incx) { if (cblas_library() != NULL) { if (cblas_dtbsv_p == NULL) cblas_dtbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const double *a, const int lda, double *x, + const int k, const double* a, const int lda, double* x, const int incx))GET_FUNC(h_libcblas, "cblas_dtbsv"); if (cblas_dtbsv_p != NULL) cblas_dtbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1515,13 +1515,13 @@ static void cblas_dtbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ctbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ctbsv_p == NULL) cblas_ctbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const void *a, const 
int lda, void *x, + const int k, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ctbsv"); if (cblas_ctbsv_p != NULL) cblas_ctbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1529,13 +1529,13 @@ static void cblas_ctbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ztbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const int k, const void *a, - const int lda, void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const int k, const void* a, + const int lda, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ztbsv_p == NULL) cblas_ztbsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, CBLAS_DIAG unit_diag, const int n, - const int k, const void *a, const int lda, void *x, + const int k, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ztbsv"); if (cblas_ztbsv_p != NULL) cblas_ztbsv_p(layout, upper_lower, trans, unit_diag, n, k, a, lda, x, incx); @@ -1543,13 +1543,13 @@ static void cblas_ztbsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_stpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, float *x, + CBLAS_DIAG unit_diag, const int n, const float* a, float* x, const int incx) { if (cblas_library() != NULL) { if (cblas_stpmv_p == NULL) cblas_stpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, float *x, + CBLAS_DIAG unit_diag, const int n, const float* a, float* x, const int incx))GET_FUNC(h_libcblas, "cblas_stpmv"); if (cblas_stpmv_p != NULL) cblas_stpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1557,13 +1557,13 @@ static void cblas_stpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static 
void cblas_dtpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, double *x, + CBLAS_DIAG unit_diag, const int n, const double* a, double* x, const int incx) { if (cblas_library() != NULL) { if (cblas_dtpmv_p == NULL) cblas_dtpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, double *x, + CBLAS_DIAG unit_diag, const int n, const double* a, double* x, const int incx))GET_FUNC(h_libcblas, "cblas_dtpmv"); if (cblas_dtpmv_p != NULL) cblas_dtpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1571,13 +1571,13 @@ static void cblas_dtpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ctpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ctpmv_p == NULL) cblas_ctpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ctpmv"); if (cblas_ctpmv_p != NULL) cblas_ctpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1585,13 +1585,13 @@ static void cblas_ctpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ztpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ztpmv_p == NULL) cblas_ztpmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void 
*x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ztpmv"); if (cblas_ztpmv_p != NULL) cblas_ztpmv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1599,13 +1599,13 @@ static void cblas_ztpmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_stpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, float *x, + CBLAS_DIAG unit_diag, const int n, const float* a, float* x, const int incx) { if (cblas_library() != NULL) { if (cblas_stpsv_p == NULL) cblas_stpsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, float *x, + CBLAS_DIAG unit_diag, const int n, const float* a, float* x, const int incx))GET_FUNC(h_libcblas, "cblas_stpsv"); if (cblas_stpsv_p != NULL) cblas_stpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1613,13 +1613,13 @@ static void cblas_stpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_dtpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, double *x, + CBLAS_DIAG unit_diag, const int n, const double* a, double* x, const int incx) { if (cblas_library() != NULL) { if (cblas_dtpsv_p == NULL) cblas_dtpsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, double *x, + CBLAS_DIAG unit_diag, const int n, const double* a, double* x, const int incx))GET_FUNC(h_libcblas, "cblas_dtpsv"); if (cblas_dtpsv_p != NULL) cblas_dtpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1627,13 +1627,13 @@ static void cblas_dtpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ctpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG 
unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ctpsv_p == NULL) cblas_ctpsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ctpsv"); if (cblas_ctpsv_p != NULL) cblas_ctpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1641,13 +1641,13 @@ static void cblas_ctpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ztpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ztpsv_p == NULL) cblas_ztpsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ztpsv"); if (cblas_ztpsv_p != NULL) cblas_ztpsv_p(layout, upper_lower, trans, unit_diag, n, a, x, incx); @@ -1655,41 +1655,41 @@ static void cblas_ztpsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_strmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, const int lda, - float *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const float* a, const int lda, + float* x, const int incx) { if (cblas_library() != NULL) { if (cblas_strmv_p == NULL) cblas_strmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, const int lda, - float *x, const int incx))GET_FUNC(h_libcblas, 
"cblas_strmv"); + CBLAS_DIAG unit_diag, const int n, const float* a, const int lda, + float* x, const int incx))GET_FUNC(h_libcblas, "cblas_strmv"); if (cblas_strmv_p != NULL) cblas_strmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); } } static void cblas_dtrmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, const int lda, - double *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const double* a, const int lda, + double* x, const int incx) { if (cblas_library() != NULL) { if (cblas_dtrmv_p == NULL) cblas_dtrmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, const int lda, - double *x, const int incx))GET_FUNC(h_libcblas, "cblas_dtrmv"); + CBLAS_DIAG unit_diag, const int n, const double* a, const int lda, + double* x, const int incx))GET_FUNC(h_libcblas, "cblas_dtrmv"); if (cblas_dtrmv_p != NULL) cblas_dtrmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); } } static void cblas_ctrmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ctrmv_p == NULL) cblas_ctrmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ctrmv"); if (cblas_ctrmv_p != NULL) cblas_ctrmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); @@ -1697,13 +1697,13 @@ static void cblas_ctrmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ztrmv_wrapper(CBLAS_LAYOUT layout, 
CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ztrmv_p == NULL) cblas_ztrmv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ztrmv"); if (cblas_ztrmv_p != NULL) cblas_ztrmv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); @@ -1711,41 +1711,41 @@ static void cblas_ztrmv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_strsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, const int lda, - float *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const float* a, const int lda, + float* x, const int incx) { if (cblas_library() != NULL) { if (cblas_strsv_p == NULL) cblas_strsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const float *a, const int lda, - float *x, const int incx))GET_FUNC(h_libcblas, "cblas_strsv"); + CBLAS_DIAG unit_diag, const int n, const float* a, const int lda, + float* x, const int incx))GET_FUNC(h_libcblas, "cblas_strsv"); if (cblas_strsv_p != NULL) cblas_strsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); } } static void cblas_dtrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, const int lda, - double *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const double* a, const int lda, + double* x, const int incx) { if (cblas_library() != NULL) { if (cblas_dtrsv_p == NULL) cblas_dtrsv_p = 
(void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const double *a, const int lda, - double *x, const int incx))GET_FUNC(h_libcblas, "cblas_dtrsv"); + CBLAS_DIAG unit_diag, const int n, const double* a, const int lda, + double* x, const int incx))GET_FUNC(h_libcblas, "cblas_dtrsv"); if (cblas_dtrsv_p != NULL) cblas_dtrsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); } } static void cblas_ctrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ctrsv_p == NULL) cblas_ctrsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ctrsv"); if (cblas_ctrsv_p != NULL) cblas_ctrsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); @@ -1753,13 +1753,13 @@ static void cblas_ctrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL } static void cblas_ztrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, - void *x, const int incx) { + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, + void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_ztrsv_p == NULL) cblas_ztrsv_p = (void (*)(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBLAS_TRANSPOSE trans, - CBLAS_DIAG unit_diag, const int n, const void *a, const int lda, void *x, + CBLAS_DIAG unit_diag, const int n, const void* a, const int lda, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_ztrsv"); if (cblas_ztrsv_p != NULL) 
cblas_ztrsv_p(layout, upper_lower, trans, unit_diag, n, a, lda, x, incx); @@ -1768,81 +1768,81 @@ static void cblas_ztrsv_wrapper(CBLAS_LAYOUT layout, CBLAS_UPLO upper_lower, CBL /* Level 1 */ -static float (*cblas_sasum_p)(const int n, const float *x, const int incx); -static double (*cblas_dasum_p)(const int n, const double *x, const int incx); -static float (*cblas_scasum_p)(const int n, const void *x, const int incx); -static double (*cblas_dzasum_p)(const int n, const void *x, const int incx); -static void (*cblas_saxpy_p)(const int n, const float alpha, const float *x, const int incx, - float *y, const int incy); -static void (*cblas_daxpy_p)(const int n, const double alpha, const double *x, const int incx, - double *y, const int incy); -static void (*cblas_caxpy_p)(const int n, const void *alpha, const void *x, const int incx, void *y, +static float (*cblas_sasum_p)(const int n, const float* x, const int incx); +static double (*cblas_dasum_p)(const int n, const double* x, const int incx); +static float (*cblas_scasum_p)(const int n, const void* x, const int incx); +static double (*cblas_dzasum_p)(const int n, const void* x, const int incx); +static void (*cblas_saxpy_p)(const int n, const float alpha, const float* x, const int incx, + float* y, const int incy); +static void (*cblas_daxpy_p)(const int n, const double alpha, const double* x, const int incx, + double* y, const int incy); +static void (*cblas_caxpy_p)(const int n, const void* alpha, const void* x, const int incx, void* y, const int incy); -static void (*cblas_zaxpy_p)(const int n, const void *alpha, const void *x, const int incx, void *y, +static void (*cblas_zaxpy_p)(const int n, const void* alpha, const void* x, const int incx, void* y, const int incy); -static void (*cblas_scopy_p)(const int n, const float *x, const int incx, float *y, const int incy); -static void (*cblas_dcopy_p)(const int n, const double *x, const int incx, double *y, +static void (*cblas_scopy_p)(const int n, const float* 
x, const int incx, float* y, const int incy); +static void (*cblas_dcopy_p)(const int n, const double* x, const int incx, double* y, const int incy); -static void (*cblas_ccopy_p)(const int n, const void *x, const int incx, void *y, const int incy); -static void (*cblas_zcopy_p)(const int n, const void *x, const int incx, void *y, const int incy); -static float (*cblas_sdot_p)(const int n, const float *x, const int incx, const float *y, +static void (*cblas_ccopy_p)(const int n, const void* x, const int incx, void* y, const int incy); +static void (*cblas_zcopy_p)(const int n, const void* x, const int incx, void* y, const int incy); +static float (*cblas_sdot_p)(const int n, const float* x, const int incx, const float* y, const int incy); -static double (*cblas_ddot_p)(const int n, const double *x, const int incx, const double *y, +static double (*cblas_ddot_p)(const int n, const double* x, const int incx, const double* y, const int incy); -static double (*cblas_dsdot_p)(const int n, const float *x, const int incx, const float *y, +static double (*cblas_dsdot_p)(const int n, const float* x, const int incx, const float* y, const int incy); -static float (*cblas_sdsdot_p)(const int n, const float sb, const float *x, const int incx, - const float *y, const int incy); -static float (*cblas_snrm2_p)(const int n, const float *x, const int incx); -static double (*cblas_dnrm2_p)(const int n, const double *x, const int incx); -static float (*cblas_scnrm2_p)(const int n, const void *x, const int incx); -static double (*cblas_dznrm2_p)(const int n, const void *x, const int incx); -static void (*cblas_srot_p)(const int n, float *x, const int incx, float *y, const int incy, +static float (*cblas_sdsdot_p)(const int n, const float sb, const float* x, const int incx, + const float* y, const int incy); +static float (*cblas_snrm2_p)(const int n, const float* x, const int incx); +static double (*cblas_dnrm2_p)(const int n, const double* x, const int incx); +static float 
(*cblas_scnrm2_p)(const int n, const void* x, const int incx); +static double (*cblas_dznrm2_p)(const int n, const void* x, const int incx); +static void (*cblas_srot_p)(const int n, float* x, const int incx, float* y, const int incy, const float c, const float s); -static void (*cblas_drot_p)(const int n, double *x, const int incx, double *y, const int incy, +static void (*cblas_drot_p)(const int n, double* x, const int incx, double* y, const int incy, const double c, const double s); -static void (*csrot_p)(const int *n, void *x, const int *incx, void *y, const int *incy, - const float *c, const float *s); -static void (*zdrot_p)(const int *n, void *x, const int *incx, void *y, const int *incy, - const double *c, const double *s); -static void (*cblas_srotg_p)(float *a, float *b, float *c, float *s); -static void (*cblas_drotg_p)(double *a, double *b, double *c, double *s); -static void (*crotg_p)(void *a, void *b, float *c, void *s); -static void (*zrotg_p)(void *a, void *b, double *c, void *s); -static void (*cblas_srotm_p)(const int n, float *x, const int incx, float *y, const int incy, - const float *param); -static void (*cblas_drotm_p)(const int n, double *x, const int incx, double *y, const int incy, - const double *param); -static void (*cblas_srotmg_p)(float *d1, float *d2, float *x1, float y1, float *param); -static void (*cblas_drotmg_p)(double *d1, double *d2, double *x1, double y1, double *param); -static void (*cblas_sscal_p)(const int n, const float alpha, float *x, const int incx); -static void (*cblas_dscal_p)(const int n, const double alpha, double *x, const int incx); -static void (*cblas_cscal_p)(const int n, const void *alpha, void *x, const int incx); -static void (*cblas_zscal_p)(const int n, const void *alpha, void *x, const int incx); -static void (*cblas_csscal_p)(const int n, const float alpha, void *x, const int incx); -static void (*cblas_zdscal_p)(const int n, const double alpha, void *x, const int incx); -static void 
(*cblas_sswap_p)(const int n, float *x, const int incx, float *y, const int incy); -static void (*cblas_dswap_p)(const int n, double *x, const int incx, double *y, const int incy); -static void (*cblas_cswap_p)(const int n, void *x, const int incx, void *y, const int incy); -static void (*cblas_zswap_p)(const int n, void *x, const int incx, void *y, const int incy); -static void (*cblas_cdotc_sub_p)(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres); -static void (*cblas_zdotc_sub_p)(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres); -static void (*cblas_cdotu_sub_p)(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres); -static void (*cblas_zdotu_sub_p)(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres); -static int (*cblas_isamax_p)(const int n, const float *x, const int incx); -static int (*cblas_idamax_p)(const int n, const double *x, const int incx); -static int (*cblas_icamax_p)(const int n, const void *x, const int incx); -static int (*cblas_izamax_p)(const int n, const void *x, const int incx); - -static float cblas_sasum_wrapper(const int n, const float *x, const int incx) { +static void (*csrot_p)(const int* n, void* x, const int* incx, void* y, const int* incy, + const float* c, const float* s); +static void (*zdrot_p)(const int* n, void* x, const int* incx, void* y, const int* incy, + const double* c, const double* s); +static void (*cblas_srotg_p)(float* a, float* b, float* c, float* s); +static void (*cblas_drotg_p)(double* a, double* b, double* c, double* s); +static void (*crotg_p)(void* a, void* b, float* c, void* s); +static void (*zrotg_p)(void* a, void* b, double* c, void* s); +static void (*cblas_srotm_p)(const int n, float* x, const int incx, float* y, const int incy, + const float* param); +static void (*cblas_drotm_p)(const int n, double* x, const int incx, double* y, const int incy, + const 
double* param); +static void (*cblas_srotmg_p)(float* d1, float* d2, float* x1, float y1, float* param); +static void (*cblas_drotmg_p)(double* d1, double* d2, double* x1, double y1, double* param); +static void (*cblas_sscal_p)(const int n, const float alpha, float* x, const int incx); +static void (*cblas_dscal_p)(const int n, const double alpha, double* x, const int incx); +static void (*cblas_cscal_p)(const int n, const void* alpha, void* x, const int incx); +static void (*cblas_zscal_p)(const int n, const void* alpha, void* x, const int incx); +static void (*cblas_csscal_p)(const int n, const float alpha, void* x, const int incx); +static void (*cblas_zdscal_p)(const int n, const double alpha, void* x, const int incx); +static void (*cblas_sswap_p)(const int n, float* x, const int incx, float* y, const int incy); +static void (*cblas_dswap_p)(const int n, double* x, const int incx, double* y, const int incy); +static void (*cblas_cswap_p)(const int n, void* x, const int incx, void* y, const int incy); +static void (*cblas_zswap_p)(const int n, void* x, const int incx, void* y, const int incy); +static void (*cblas_cdotc_sub_p)(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres); +static void (*cblas_zdotc_sub_p)(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres); +static void (*cblas_cdotu_sub_p)(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres); +static void (*cblas_zdotu_sub_p)(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres); +static int (*cblas_isamax_p)(const int n, const float* x, const int incx); +static int (*cblas_idamax_p)(const int n, const double* x, const int incx); +static int (*cblas_icamax_p)(const int n, const void* x, const int incx); +static int (*cblas_izamax_p)(const int n, const void* x, const int incx); + +static float cblas_sasum_wrapper(const int n, const float* x, const int incx) { 
float sasum_res = 0.0f; if (cblas_library() != NULL) { if (cblas_sasum_p == NULL) - cblas_sasum_p = (float (*)(const int n, const float *x, const int incx))GET_FUNC( + cblas_sasum_p = (float (*)(const int n, const float* x, const int incx))GET_FUNC( h_libcblas, "cblas_sasum"); if (cblas_sasum_p != NULL) sasum_res = cblas_sasum_p(n, x, incx); @@ -1850,11 +1850,11 @@ static float cblas_sasum_wrapper(const int n, const float *x, const int incx) { return sasum_res; } -static double cblas_dasum_wrapper(const int n, const double *x, const int incx) { +static double cblas_dasum_wrapper(const int n, const double* x, const int incx) { double dasum_res = 0.0; if (cblas_library() != NULL) { if (cblas_dasum_p == NULL) - cblas_dasum_p = (double (*)(const int n, const double *x, const int incx))GET_FUNC( + cblas_dasum_p = (double (*)(const int n, const double* x, const int incx))GET_FUNC( h_libcblas, "cblas_dasum"); if (cblas_dasum_p != NULL) dasum_res = cblas_dasum_p(n, x, incx); @@ -1862,11 +1862,11 @@ static double cblas_dasum_wrapper(const int n, const double *x, const int incx) return dasum_res; } -static float cblas_scasum_wrapper(const int n, const void *x, const int incx) { +static float cblas_scasum_wrapper(const int n, const void* x, const int incx) { float scasum_res = 0.0f; if (cblas_library() != NULL) { if (cblas_scasum_p == NULL) - cblas_scasum_p = (float (*)(const int n, const void *x, const int incx))GET_FUNC( + cblas_scasum_p = (float (*)(const int n, const void* x, const int incx))GET_FUNC( h_libcblas, "cblas_scasum"); if (cblas_scasum_p != NULL) scasum_res = cblas_scasum_p(n, x, incx); @@ -1874,11 +1874,11 @@ static float cblas_scasum_wrapper(const int n, const void *x, const int incx) { return scasum_res; } -static double cblas_dzasum_wrapper(const int n, const void *x, const int incx) { +static double cblas_dzasum_wrapper(const int n, const void* x, const int incx) { double dzasum_res = 0.0; if (cblas_library() != NULL) { if (cblas_dzasum_p == NULL) - 
cblas_dzasum_p = (double (*)(const int n, const void *x, const int incx))GET_FUNC( + cblas_dzasum_p = (double (*)(const int n, const void* x, const int incx))GET_FUNC( h_libcblas, "cblas_dzasum"); if (cblas_dzasum_p != NULL) dzasum_res = cblas_dzasum_p(n, x, incx); @@ -1886,102 +1886,102 @@ static double cblas_dzasum_wrapper(const int n, const void *x, const int incx) { return dzasum_res; } -static void cblas_saxpy_wrapper(const int n, const float alpha, const float *x, const int incx, - float *y, const int incy) { +static void cblas_saxpy_wrapper(const int n, const float alpha, const float* x, const int incx, + float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_saxpy_p == NULL) cblas_saxpy_p = - (void (*)(const int n, const float alpha, const float *x, const int incx, float *y, + (void (*)(const int n, const float alpha, const float* x, const int incx, float* y, const int incy))GET_FUNC(h_libcblas, "cblas_saxpy"); if (cblas_saxpy_p != NULL) cblas_saxpy_p(n, alpha, x, incx, y, incy); } } -static void cblas_daxpy_wrapper(const int n, const double alpha, const double *x, const int incx, - double *y, const int incy) { +static void cblas_daxpy_wrapper(const int n, const double alpha, const double* x, const int incx, + double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_daxpy_p == NULL) cblas_daxpy_p = - (void (*)(const int n, const double alpha, const double *x, const int incx, - double *y, const int incy))GET_FUNC(h_libcblas, "cblas_daxpy"); + (void (*)(const int n, const double alpha, const double* x, const int incx, + double* y, const int incy))GET_FUNC(h_libcblas, "cblas_daxpy"); if (cblas_daxpy_p != NULL) cblas_daxpy_p(n, alpha, x, incx, y, incy); } } -static void cblas_caxpy_wrapper(const int n, const void *alpha, const void *x, const int incx, - void *y, const int incy) { +static void cblas_caxpy_wrapper(const int n, const void* alpha, const void* x, const int incx, + void* y, const int incy) { if (cblas_library() != NULL) { 
if (cblas_caxpy_p == NULL) - cblas_caxpy_p = (void (*)(const int n, const void *alpha, const void *x, const int incx, - void *y, const int incy))GET_FUNC(h_libcblas, "cblas_caxpy"); + cblas_caxpy_p = (void (*)(const int n, const void* alpha, const void* x, const int incx, + void* y, const int incy))GET_FUNC(h_libcblas, "cblas_caxpy"); if (cblas_caxpy_p != NULL) cblas_caxpy_p(n, alpha, x, incx, y, incy); } } -static void cblas_zaxpy_wrapper(const int n, const void *alpha, const void *x, const int incx, - void *y, const int incy) { +static void cblas_zaxpy_wrapper(const int n, const void* alpha, const void* x, const int incx, + void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_zaxpy_p == NULL) - cblas_zaxpy_p = (void (*)(const int n, const void *alpha, const void *x, const int incx, - void *y, const int incy))GET_FUNC(h_libcblas, "cblas_zaxpy"); + cblas_zaxpy_p = (void (*)(const int n, const void* alpha, const void* x, const int incx, + void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zaxpy"); if (cblas_zaxpy_p != NULL) cblas_zaxpy_p(n, alpha, x, incx, y, incy); } } -static void cblas_scopy_wrapper(const int n, const float *x, const int incx, float *y, +static void cblas_scopy_wrapper(const int n, const float* x, const int incx, float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_scopy_p == NULL) - cblas_scopy_p = (void (*)(const int n, const float *x, const int incx, float *y, + cblas_scopy_p = (void (*)(const int n, const float* x, const int incx, float* y, const int incy))GET_FUNC(h_libcblas, "cblas_scopy"); if (cblas_scopy_p != NULL) cblas_scopy_p(n, x, incx, y, incy); } } -static void cblas_dcopy_wrapper(const int n, const double *x, const int incx, double *y, +static void cblas_dcopy_wrapper(const int n, const double* x, const int incx, double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_dcopy_p == NULL) - cblas_dcopy_p = (void (*)(const int n, const double *x, const int incx, double *y, + cblas_dcopy_p = 
(void (*)(const int n, const double* x, const int incx, double* y, const int incy))GET_FUNC(h_libcblas, "cblas_dcopy"); if (cblas_dcopy_p != NULL) cblas_dcopy_p(n, x, incx, y, incy); } } -static void cblas_ccopy_wrapper(const int n, const void *x, const int incx, void *y, +static void cblas_ccopy_wrapper(const int n, const void* x, const int incx, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_ccopy_p == NULL) - cblas_ccopy_p = (void (*)(const int n, const void *x, const int incx, void *y, + cblas_ccopy_p = (void (*)(const int n, const void* x, const int incx, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_ccopy"); if (cblas_ccopy_p != NULL) cblas_ccopy_p(n, x, incx, y, incy); } } -static void cblas_zcopy_wrapper(const int n, const void *x, const int incx, void *y, +static void cblas_zcopy_wrapper(const int n, const void* x, const int incx, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_zcopy_p == NULL) - cblas_zcopy_p = (void (*)(const int n, const void *x, const int incx, void *y, + cblas_zcopy_p = (void (*)(const int n, const void* x, const int incx, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zcopy"); if (cblas_zcopy_p != NULL) cblas_zcopy_p(n, x, incx, y, incy); } } -static float cblas_sdot_wrapper(const int n, const float *x, const int incx, const float *y, +static float cblas_sdot_wrapper(const int n, const float* x, const int incx, const float* y, const int incy) { float sdot_res = 0.0f; if (cblas_library() != NULL) { if (cblas_sdot_p == NULL) - cblas_sdot_p = (float (*)(const int n, const float *x, const int incx, const float *y, + cblas_sdot_p = (float (*)(const int n, const float* x, const int incx, const float* y, const int incy))GET_FUNC(h_libcblas, "cblas_sdot"); if (cblas_sdot_p != NULL) sdot_res = cblas_sdot_p(n, x, incx, y, incy); @@ -1989,13 +1989,13 @@ static float cblas_sdot_wrapper(const int n, const float *x, const int incx, con return sdot_res; } -static double cblas_ddot_wrapper(const 
int n, const double *x, const int incx, const double *y, +static double cblas_ddot_wrapper(const int n, const double* x, const int incx, const double* y, const int incy) { double ddot_res = 0.0; if (cblas_library() != NULL) { if (cblas_ddot_p == NULL) cblas_ddot_p = - (double (*)(const int n, const double *x, const int incx, const double *y, + (double (*)(const int n, const double* x, const int incx, const double* y, const int incy))GET_FUNC(h_libcblas, "cblas_ddot"); if (cblas_ddot_p != NULL) ddot_res = cblas_ddot_p(n, x, incx, y, incy); @@ -2003,12 +2003,12 @@ static double cblas_ddot_wrapper(const int n, const double *x, const int incx, c return ddot_res; } -static double cblas_dsdot_wrapper(const int n, const float *x, const int incx, const float *y, +static double cblas_dsdot_wrapper(const int n, const float* x, const int incx, const float* y, const int incy) { double dsdot_res = 0.0; if (cblas_library() != NULL) { if (cblas_dsdot_p == NULL) - cblas_dsdot_p = (double (*)(const int n, const float *x, const int incx, const float *y, + cblas_dsdot_p = (double (*)(const int n, const float* x, const int incx, const float* y, const int incy))GET_FUNC(h_libcblas, "cblas_dsdot"); if (cblas_dsdot_p != NULL) dsdot_res = cblas_dsdot_p(n, x, incx, y, incy); @@ -2016,25 +2016,25 @@ static double cblas_dsdot_wrapper(const int n, const float *x, const int incx, c return dsdot_res; } -static float cblas_sdsdot_wrapper(const int n, const float sb, const float *x, const int incx, - const float *y, const int incy) { +static float cblas_sdsdot_wrapper(const int n, const float sb, const float* x, const int incx, + const float* y, const int incy) { float sdsdot_res = 0.0f; if (cblas_library() != NULL) { if (cblas_sdsdot_p == NULL) cblas_sdsdot_p = - (float (*)(const int n, const float sb, const float *x, const int incx, - const float *y, const int incy))GET_FUNC(h_libcblas, "cblas_sdsdot"); + (float (*)(const int n, const float sb, const float* x, const int incx, + const float* y, 
const int incy))GET_FUNC(h_libcblas, "cblas_sdsdot"); if (cblas_sdsdot_p != NULL) sdsdot_res = cblas_sdsdot_p(n, sb, x, incx, y, incy); } return sdsdot_res; } -static float cblas_snrm2_wrapper(const int n, const float *x, const int incx) { +static float cblas_snrm2_wrapper(const int n, const float* x, const int incx) { float snrm2_res = 0.0f; if (cblas_library() != NULL) { if (cblas_snrm2_p == NULL) - cblas_snrm2_p = (float (*)(const int n, const float *x, const int incx))GET_FUNC( + cblas_snrm2_p = (float (*)(const int n, const float* x, const int incx))GET_FUNC( h_libcblas, "cblas_snrm2"); if (cblas_snrm2_p != NULL) snrm2_res = cblas_snrm2_p(n, x, incx); @@ -2042,11 +2042,11 @@ static float cblas_snrm2_wrapper(const int n, const float *x, const int incx) { return snrm2_res; } -static double cblas_dnrm2_wrapper(const int n, const double *x, const int incx) { +static double cblas_dnrm2_wrapper(const int n, const double* x, const int incx) { double dnrm2_res = 0.0; if (cblas_library() != NULL) { if (cblas_dnrm2_p == NULL) - cblas_dnrm2_p = (double (*)(const int n, const double *x, const int incx))GET_FUNC( + cblas_dnrm2_p = (double (*)(const int n, const double* x, const int incx))GET_FUNC( h_libcblas, "cblas_dnrm2"); if (cblas_dnrm2_p != NULL) dnrm2_res = cblas_dnrm2_p(n, x, incx); @@ -2054,11 +2054,11 @@ static double cblas_dnrm2_wrapper(const int n, const double *x, const int incx) return dnrm2_res; } -static float cblas_scnrm2_wrapper(const int n, const void *x, const int incx) { +static float cblas_scnrm2_wrapper(const int n, const void* x, const int incx) { float scnrm2_res = 0.0f; if (cblas_library() != NULL) { if (cblas_scnrm2_p == NULL) - cblas_scnrm2_p = (float (*)(const int n, const void *x, const int incx))GET_FUNC( + cblas_scnrm2_p = (float (*)(const int n, const void* x, const int incx))GET_FUNC( h_libcblas, "cblas_scnrm2"); if (cblas_scnrm2_p != NULL) scnrm2_res = cblas_scnrm2_p(n, x, incx); @@ -2066,11 +2066,11 @@ static float 
cblas_scnrm2_wrapper(const int n, const void *x, const int incx) { return scnrm2_res; } -static double cblas_dznrm2_wrapper(const int n, const void *x, const int incx) { +static double cblas_dznrm2_wrapper(const int n, const void* x, const int incx) { double dznrm2_res = 0.0; if (cblas_library() != NULL) { if (cblas_dznrm2_p == NULL) - cblas_dznrm2_p = (double (*)(const int n, const void *x, const int incx))GET_FUNC( + cblas_dznrm2_p = (double (*)(const int n, const void* x, const int incx))GET_FUNC( h_libcblas, "cblas_dznrm2"); if (cblas_dznrm2_p != NULL) dznrm2_res = cblas_dznrm2_p(n, x, incx); @@ -2078,297 +2078,297 @@ static double cblas_dznrm2_wrapper(const int n, const void *x, const int incx) { return dznrm2_res; } -static void cblas_srot_wrapper(const int n, float *x, const int incx, float *y, const int incy, +static void cblas_srot_wrapper(const int n, float* x, const int incx, float* y, const int incy, const float c, const float s) { if (cblas_library() != NULL) { if (cblas_srot_p == NULL) cblas_srot_p = - (void (*)(const int n, float *x, const int incx, float *y, const int incy, + (void (*)(const int n, float* x, const int incx, float* y, const int incy, const float c, const float s))GET_FUNC(h_libcblas, "cblas_srot"); if (cblas_srot_p != NULL) cblas_srot_p(n, x, incx, y, incy, c, s); } } -static void cblas_drot_wrapper(const int n, double *x, const int incx, double *y, const int incy, +static void cblas_drot_wrapper(const int n, double* x, const int incx, double* y, const int incy, const double c, const double s) { if (cblas_library() != NULL) { if (cblas_drot_p == NULL) cblas_drot_p = - (void (*)(const int n, double *x, const int incx, double *y, const int incy, + (void (*)(const int n, double* x, const int incx, double* y, const int incy, const double c, const double s))GET_FUNC(h_libcblas, "cblas_drot"); if (cblas_drot_p != NULL) cblas_drot_p(n, x, incx, y, incy, c, s); } } -static void csrot_wrapper(const int *n, void *x, const int *incx, void *y, 
const int *incy, - const float *c, const float *s) { +static void csrot_wrapper(const int* n, void* x, const int* incx, void* y, const int* incy, + const float* c, const float* s) { if (blas_library() != NULL) { if (csrot_p == NULL) - csrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy, - const float *c, const float *s))GET_FUNC(h_libblas, "csrot_"); + csrot_p = (void (*)(const int* n, void* x, const int* incx, void* y, const int* incy, + const float* c, const float* s))GET_FUNC(h_libblas, "csrot_"); if (csrot_p == NULL) - csrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy, - const float *c, const float *s))GET_FUNC(h_libblas, "CSROT"); + csrot_p = (void (*)(const int* n, void* x, const int* incx, void* y, const int* incy, + const float* c, const float* s))GET_FUNC(h_libblas, "CSROT"); if (csrot_p != NULL) csrot_p(n, x, incx, y, incy, c, s); } } -static void zdrot_wrapper(const int *n, void *x, const int *incx, void *y, const int *incy, - const double *c, const double *s) { +static void zdrot_wrapper(const int* n, void* x, const int* incx, void* y, const int* incy, + const double* c, const double* s) { if (blas_library() != NULL) { if (zdrot_p == NULL) - zdrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy, - const double *c, const double *s))GET_FUNC(h_libblas, "zdrot_"); + zdrot_p = (void (*)(const int* n, void* x, const int* incx, void* y, const int* incy, + const double* c, const double* s))GET_FUNC(h_libblas, "zdrot_"); if (zdrot_p == NULL) - zdrot_p = (void (*)(const int *n, void *x, const int *incx, void *y, const int *incy, - const double *c, const double *s))GET_FUNC(h_libblas, "ZDROT"); + zdrot_p = (void (*)(const int* n, void* x, const int* incx, void* y, const int* incy, + const double* c, const double* s))GET_FUNC(h_libblas, "ZDROT"); if (zdrot_p != NULL) zdrot_p(n, x, incx, y, incy, c, s); } } -static void cblas_srotg_wrapper(float *a, float *b, float 
*c, float *s) { +static void cblas_srotg_wrapper(float* a, float* b, float* c, float* s) { if (cblas_library() != NULL) { if (cblas_srotg_p == NULL) - cblas_srotg_p = (void (*)(float *a, float *b, float *c, float *s))GET_FUNC( + cblas_srotg_p = (void (*)(float* a, float* b, float* c, float* s))GET_FUNC( h_libcblas, "cblas_srotg"); if (cblas_srotg_p != NULL) cblas_srotg_p(a, b, c, s); } } -static void cblas_drotg_wrapper(double *a, double *b, double *c, double *s) { +static void cblas_drotg_wrapper(double* a, double* b, double* c, double* s) { if (cblas_library() != NULL) { if (cblas_drotg_p == NULL) - cblas_drotg_p = (void (*)(double *a, double *b, double *c, double *s))GET_FUNC( + cblas_drotg_p = (void (*)(double* a, double* b, double* c, double* s))GET_FUNC( h_libcblas, "cblas_drotg"); if (cblas_drotg_p != NULL) cblas_drotg_p(a, b, c, s); } } -static void crotg_wrapper(void *a, void *b, float *c, void *s) { +static void crotg_wrapper(void* a, void* b, float* c, void* s) { if (blas_library() != NULL) { if (crotg_p == NULL) - crotg_p = (void (*)(void *a, void *b, float *c, void *s))GET_FUNC(h_libblas, "crotg_"); + crotg_p = (void (*)(void* a, void* b, float* c, void* s))GET_FUNC(h_libblas, "crotg_"); if (crotg_p == NULL) - crotg_p = (void (*)(void *a, void *b, float *c, void *s))GET_FUNC(h_libblas, "CROTG"); + crotg_p = (void (*)(void* a, void* b, float* c, void* s))GET_FUNC(h_libblas, "CROTG"); if (crotg_p != NULL) crotg_p(a, b, c, s); } } -static void zrotg_wrapper(void *a, void *b, double *c, void *s) { +static void zrotg_wrapper(void* a, void* b, double* c, void* s) { if (blas_library() != NULL) { if (zrotg_p == NULL) - zrotg_p = (void (*)(void *a, void *b, double *c, void *s))GET_FUNC(h_libblas, "zrotg_"); + zrotg_p = (void (*)(void* a, void* b, double* c, void* s))GET_FUNC(h_libblas, "zrotg_"); if (zrotg_p == NULL) - zrotg_p = (void (*)(void *a, void *b, double *c, void *s))GET_FUNC(h_libblas, "ZROTG"); + zrotg_p = (void (*)(void* a, void* b, double* c, void* 
s))GET_FUNC(h_libblas, "ZROTG"); if (zrotg_p != NULL) zrotg_p(a, b, c, s); } } -static void cblas_srotm_wrapper(const int n, float *x, const int incx, float *y, const int incy, - const float *param) { +static void cblas_srotm_wrapper(const int n, float* x, const int incx, float* y, const int incy, + const float* param) { if (cblas_library() != NULL) { if (cblas_srotm_p == NULL) cblas_srotm_p = - (void (*)(const int n, float *x, const int incx, float *y, const int incy, - const float *param))GET_FUNC(h_libcblas, "cblas_srotm"); + (void (*)(const int n, float* x, const int incx, float* y, const int incy, + const float* param))GET_FUNC(h_libcblas, "cblas_srotm"); if (cblas_srotm_p != NULL) cblas_srotm_p(n, x, incx, y, incy, param); } } -static void cblas_drotm_wrapper(const int n, double *x, const int incx, double *y, const int incy, - const double *param) { +static void cblas_drotm_wrapper(const int n, double* x, const int incx, double* y, const int incy, + const double* param) { if (cblas_library() != NULL) { if (cblas_drotm_p == NULL) cblas_drotm_p = - (void (*)(const int n, double *x, const int incx, double *y, const int incy, - const double *param))GET_FUNC(h_libcblas, "cblas_drotm"); + (void (*)(const int n, double* x, const int incx, double* y, const int incy, + const double* param))GET_FUNC(h_libcblas, "cblas_drotm"); if (cblas_drotm_p != NULL) cblas_drotm_p(n, x, incx, y, incy, param); } } -static void cblas_srotmg_wrapper(float *d1, float *d2, float *x1, float y1, float *param) { +static void cblas_srotmg_wrapper(float* d1, float* d2, float* x1, float y1, float* param) { if (cblas_library() != NULL) { if (cblas_srotmg_p == NULL) - cblas_srotmg_p = (void (*)(float *d1, float *d2, float *x1, float y1, - float *param))GET_FUNC(h_libcblas, "cblas_srotmg"); + cblas_srotmg_p = (void (*)(float* d1, float* d2, float* x1, float y1, + float* param))GET_FUNC(h_libcblas, "cblas_srotmg"); if (cblas_srotmg_p != NULL) cblas_srotmg_p(d1, d2, x1, y1, param); } } -static void 
cblas_drotmg_wrapper(double *d1, double *d2, double *x1, double y1, double *param) { +static void cblas_drotmg_wrapper(double* d1, double* d2, double* x1, double y1, double* param) { if (cblas_library() != NULL) { if (cblas_drotmg_p == NULL) - cblas_drotmg_p = (void (*)(double *d1, double *d2, double *x1, double y1, - double *param))GET_FUNC(h_libcblas, "cblas_drotmg"); + cblas_drotmg_p = (void (*)(double* d1, double* d2, double* x1, double y1, + double* param))GET_FUNC(h_libcblas, "cblas_drotmg"); if (cblas_drotmg_p != NULL) cblas_drotmg_p(d1, d2, x1, y1, param); } } -static void cblas_sscal_wrapper(const int n, const float alpha, float *x, const int incx) { +static void cblas_sscal_wrapper(const int n, const float alpha, float* x, const int incx) { if (cblas_library() != NULL) { if (cblas_sscal_p == NULL) - cblas_sscal_p = (void (*)(const int n, const float alpha, float *x, + cblas_sscal_p = (void (*)(const int n, const float alpha, float* x, const int incx))GET_FUNC(h_libcblas, "cblas_sscal"); if (cblas_sscal_p != NULL) cblas_sscal_p(n, alpha, x, incx); } } -static void cblas_dscal_wrapper(const int n, const double alpha, double *x, const int incx) { +static void cblas_dscal_wrapper(const int n, const double alpha, double* x, const int incx) { if (cblas_library() != NULL) { if (cblas_dscal_p == NULL) - cblas_dscal_p = (void (*)(const int n, const double alpha, double *x, + cblas_dscal_p = (void (*)(const int n, const double alpha, double* x, const int incx))GET_FUNC(h_libcblas, "cblas_dscal"); if (cblas_dscal_p != NULL) cblas_dscal_p(n, alpha, x, incx); } } -static void cblas_cscal_wrapper(const int n, const void *alpha, void *x, const int incx) { +static void cblas_cscal_wrapper(const int n, const void* alpha, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_cscal_p == NULL) - cblas_cscal_p = (void (*)(const int n, const void *alpha, void *x, + cblas_cscal_p = (void (*)(const int n, const void* alpha, void* x, const int 
incx))GET_FUNC(h_libcblas, "cblas_cscal"); if (cblas_cscal_p != NULL) cblas_cscal_p(n, alpha, x, incx); } } -static void cblas_zscal_wrapper(const int n, const void *alpha, void *x, const int incx) { +static void cblas_zscal_wrapper(const int n, const void* alpha, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_zscal_p == NULL) - cblas_zscal_p = (void (*)(const int n, const void *alpha, void *x, + cblas_zscal_p = (void (*)(const int n, const void* alpha, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_zscal"); if (cblas_zscal_p != NULL) cblas_zscal_p(n, alpha, x, incx); } } -static void cblas_csscal_wrapper(const int n, const float alpha, void *x, const int incx) { +static void cblas_csscal_wrapper(const int n, const float alpha, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_csscal_p == NULL) - cblas_csscal_p = (void (*)(const int n, const float alpha, void *x, + cblas_csscal_p = (void (*)(const int n, const float alpha, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_csscal"); if (cblas_csscal_p != NULL) cblas_csscal_p(n, alpha, x, incx); } } -static void cblas_zdscal_wrapper(const int n, const double alpha, void *x, const int incx) { +static void cblas_zdscal_wrapper(const int n, const double alpha, void* x, const int incx) { if (cblas_library() != NULL) { if (cblas_zdscal_p == NULL) - cblas_zdscal_p = (void (*)(const int n, const double alpha, void *x, + cblas_zdscal_p = (void (*)(const int n, const double alpha, void* x, const int incx))GET_FUNC(h_libcblas, "cblas_zdscal"); if (cblas_zdscal_p != NULL) cblas_zdscal_p(n, alpha, x, incx); } } -static void cblas_sswap_wrapper(const int n, float *x, const int incx, float *y, const int incy) { +static void cblas_sswap_wrapper(const int n, float* x, const int incx, float* y, const int incy) { if (cblas_library() != NULL) { if (cblas_sswap_p == NULL) - cblas_sswap_p = (void (*)(const int n, float *x, const int incx, float *y, + cblas_sswap_p = (void (*)(const int n, 
float* x, const int incx, float* y, const int incy))GET_FUNC(h_libcblas, "cblas_sswap"); if (cblas_sswap_p != NULL) cblas_sswap_p(n, x, incx, y, incy); } } -static void cblas_dswap_wrapper(const int n, double *x, const int incx, double *y, const int incy) { +static void cblas_dswap_wrapper(const int n, double* x, const int incx, double* y, const int incy) { if (cblas_library() != NULL) { if (cblas_dswap_p == NULL) - cblas_dswap_p = (void (*)(const int n, double *x, const int incx, double *y, + cblas_dswap_p = (void (*)(const int n, double* x, const int incx, double* y, const int incy))GET_FUNC(h_libcblas, "cblas_dswap"); if (cblas_dswap_p != NULL) cblas_dswap_p(n, x, incx, y, incy); } } -static void cblas_cswap_wrapper(const int n, void *x, const int incx, void *y, const int incy) { +static void cblas_cswap_wrapper(const int n, void* x, const int incx, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_cswap_p == NULL) - cblas_cswap_p = (void (*)(const int n, void *x, const int incx, void *y, + cblas_cswap_p = (void (*)(const int n, void* x, const int incx, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_cswap"); if (cblas_cswap_p != NULL) cblas_cswap_p(n, x, incx, y, incy); } } -static void cblas_zswap_wrapper(const int n, void *x, const int incx, void *y, const int incy) { +static void cblas_zswap_wrapper(const int n, void* x, const int incx, void* y, const int incy) { if (cblas_library() != NULL) { if (cblas_zswap_p == NULL) - cblas_zswap_p = (void (*)(const int n, void *x, const int incx, void *y, + cblas_zswap_p = (void (*)(const int n, void* x, const int incx, void* y, const int incy))GET_FUNC(h_libcblas, "cblas_zswap"); if (cblas_zswap_p != NULL) cblas_zswap_p(n, x, incx, y, incy); } } -static void cblas_cdotc_sub_wrapper(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres) { +static void cblas_cdotc_sub_wrapper(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres) { 
if (cblas_library() != NULL) { if (cblas_cdotc_sub_p == NULL) cblas_cdotc_sub_p = - (void (*)(const int n, const void *x, const int incx, const void *y, const int incy, - void *pres))GET_FUNC(h_libcblas, "cblas_cdotc_sub"); + (void (*)(const int n, const void* x, const int incx, const void* y, const int incy, + void* pres))GET_FUNC(h_libcblas, "cblas_cdotc_sub"); if (cblas_cdotc_sub_p != NULL) cblas_cdotc_sub_p(n, x, incx, y, incy, pres); } } -static void cblas_zdotc_sub_wrapper(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres) { +static void cblas_zdotc_sub_wrapper(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres) { if (cblas_library() != NULL) { if (cblas_zdotc_sub_p == NULL) cblas_zdotc_sub_p = - (void (*)(const int n, const void *x, const int incx, const void *y, const int incy, - void *pres))GET_FUNC(h_libcblas, "cblas_zdotc_sub"); + (void (*)(const int n, const void* x, const int incx, const void* y, const int incy, + void* pres))GET_FUNC(h_libcblas, "cblas_zdotc_sub"); if (cblas_zdotc_sub_p != NULL) cblas_zdotc_sub_p(n, x, incx, y, incy, pres); } } -static void cblas_cdotu_sub_wrapper(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres) { +static void cblas_cdotu_sub_wrapper(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres) { if (cblas_library() != NULL) { if (cblas_cdotu_sub_p == NULL) cblas_cdotu_sub_p = - (void (*)(const int n, const void *x, const int incx, const void *y, const int incy, - void *pres))GET_FUNC(h_libcblas, "cblas_cdotu_sub"); + (void (*)(const int n, const void* x, const int incx, const void* y, const int incy, + void* pres))GET_FUNC(h_libcblas, "cblas_cdotu_sub"); if (cblas_cdotu_sub_p != NULL) cblas_cdotu_sub_p(n, x, incx, y, incy, pres); } } -static void cblas_zdotu_sub_wrapper(const int n, const void *x, const int incx, const void *y, - const int incy, void *pres) { +static void 
cblas_zdotu_sub_wrapper(const int n, const void* x, const int incx, const void* y, + const int incy, void* pres) { if (cblas_library() != NULL) { if (cblas_zdotu_sub_p == NULL) cblas_zdotu_sub_p = - (void (*)(const int n, const void *x, const int incx, const void *y, const int incy, - void *pres))GET_FUNC(h_libcblas, "cblas_zdotu_sub"); + (void (*)(const int n, const void* x, const int incx, const void* y, const int incy, + void* pres))GET_FUNC(h_libcblas, "cblas_zdotu_sub"); if (cblas_zdotu_sub_p != NULL) cblas_zdotu_sub_p(n, x, incx, y, incy, pres); } } -static int cblas_isamax_wrapper(const int n, const float *x, const int incx) { +static int cblas_isamax_wrapper(const int n, const float* x, const int incx) { int isamax_res = 0; if (cblas_library() != NULL) { if (cblas_isamax_p == NULL) - cblas_isamax_p = (int (*)(const int n, const float *x, const int incx))GET_FUNC( + cblas_isamax_p = (int (*)(const int n, const float* x, const int incx))GET_FUNC( h_libcblas, "cblas_isamax"); if (cblas_isamax_p != NULL) isamax_res = cblas_isamax_p(n, x, incx); @@ -2376,11 +2376,11 @@ static int cblas_isamax_wrapper(const int n, const float *x, const int incx) { return isamax_res; } -static int cblas_idamax_wrapper(const int n, const double *x, const int incx) { +static int cblas_idamax_wrapper(const int n, const double* x, const int incx) { int idamax_res = 0; if (cblas_library() != NULL) { if (cblas_idamax_p == NULL) - cblas_idamax_p = (int (*)(const int n, const double *x, const int incx))GET_FUNC( + cblas_idamax_p = (int (*)(const int n, const double* x, const int incx))GET_FUNC( h_libcblas, "cblas_idamax"); if (cblas_idamax_p != NULL) idamax_res = cblas_idamax_p(n, x, incx); @@ -2388,11 +2388,11 @@ static int cblas_idamax_wrapper(const int n, const double *x, const int incx) { return idamax_res; } -static int cblas_icamax_wrapper(const int n, const void *x, const int incx) { +static int cblas_icamax_wrapper(const int n, const void* x, const int incx) { int icamax_res = 0; 
if (cblas_library() != NULL) { if (cblas_icamax_p == NULL) - cblas_icamax_p = (int (*)(const int n, const void *x, const int incx))GET_FUNC( + cblas_icamax_p = (int (*)(const int n, const void* x, const int incx))GET_FUNC( h_libcblas, "cblas_icamax"); if (cblas_icamax_p != NULL) icamax_res = cblas_icamax_p(n, x, incx); @@ -2400,11 +2400,11 @@ static int cblas_icamax_wrapper(const int n, const void *x, const int incx) { return icamax_res; } -static int cblas_izamax_wrapper(const int n, const void *x, const int incx) { +static int cblas_izamax_wrapper(const int n, const void* x, const int incx) { int izamax_res = 0; if (cblas_library() != NULL) { if (cblas_izamax_p == NULL) - cblas_izamax_p = (int (*)(const int n, const void *x, const int incx))GET_FUNC( + cblas_izamax_p = (int (*)(const int n, const void* x, const int incx))GET_FUNC( h_libcblas, "cblas_izamax"); if (cblas_izamax_p != NULL) izamax_res = cblas_izamax_p(n, x, incx); diff --git a/tests/unit_tests/blas/include/test_common.hpp b/tests/unit_tests/blas/include/test_common.hpp index 5d607991e..0b64d3acc 100644 --- a/tests/unit_tests/blas/include/test_common.hpp +++ b/tests/unit_tests/blas/include/test_common.hpp @@ -86,7 +86,7 @@ constexpr T matrix_size(oneapi::mkl::layout layout, oneapi::mkl::transpose trans // SYCL buffer creation helper. 
template -sycl::buffer make_buffer(const vec &v) { +sycl::buffer make_buffer(const vec& v) { sycl::buffer buf(v.data(), sycl::range<1>(v.size())); return buf; } @@ -174,14 +174,14 @@ std::complex rand_scalar(int mag) { } template -void rand_vector(fp *v, int n, int inc) { +void rand_vector(fp* v, int n, int inc) { int abs_inc = std::abs(inc); for (int i = 0; i < n; i++) v[i * abs_inc] = rand_scalar(); } template -void rand_vector(vec &v, int n, int inc) { +void rand_vector(vec& v, int n, int inc) { using fp = typename vec::value_type; int abs_inc = std::abs(inc); @@ -209,7 +209,7 @@ oneapi::mkl::transpose rand_trans() { } template -void print_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld, char *name) { +void print_matrix(vec& M, oneapi::mkl::transpose trans, int m, int n, int ld, char* name) { std::cout << "Matrix " << name << ":\n"; for (int i = 0; i < m; i++) { for (int j = 0; j < n; j++) { @@ -223,15 +223,15 @@ void print_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld, ch } template -void copy_vector(fp *src, int n, int inc, fp *dest) { +void copy_vector(fp* src, int n, int inc, fp* dest) { int abs_inc = std::abs(inc); for (int i = 0; i < n; i++) dest[i * abs_inc] = src[i * abs_inc]; } template -void copy_matrix(vec_src &src, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, - int n, int ld, vec_dest &dest) { +void copy_matrix(vec_src& src, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, + int n, int ld, vec_dest& dest) { using T_data = typename vec_dest::value_type; dest.resize(matrix_size(layout, trans, m, n, ld)); if (((trans == oneapi::mkl::transpose::nontrans) && @@ -250,8 +250,8 @@ void copy_matrix(vec_src &src, oneapi::mkl::layout layout, oneapi::mkl::transpos } template -void copy_matrix(fp_src *src, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, - int n, int ld, fp_dst *dest) { +void copy_matrix(fp_src* src, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, 
int m, + int n, int ld, fp_dst* dest) { if (((trans == oneapi::mkl::transpose::nontrans) && (layout == oneapi::mkl::layout::col_major)) || ((trans != oneapi::mkl::transpose::nontrans) && @@ -268,7 +268,7 @@ void copy_matrix(fp_src *src, oneapi::mkl::layout layout, oneapi::mkl::transpose } template -void rand_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld) { +void rand_matrix(vec& M, oneapi::mkl::transpose trans, int m, int n, int ld) { using fp = typename vec::value_type; M.resize(matrix_size(trans, m, n, ld)); @@ -286,7 +286,7 @@ void rand_matrix(vec &M, oneapi::mkl::transpose trans, int m, int n, int ld) { } template -void rand_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n, +void rand_matrix(vec& M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n, int ld) { using fp = typename vec::value_type; @@ -308,7 +308,7 @@ void rand_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::transpose tran } template -void rand_matrix(fp *M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n, +void rand_matrix(fp* M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n, int ld) { if (((trans == oneapi::mkl::transpose::nontrans) && (layout == oneapi::mkl::layout::col_major)) || @@ -326,7 +326,7 @@ void rand_matrix(fp *M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans } template -void rand_trsm_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, +void rand_trsm_matrix(vec& M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n, int ld) { using fp = typename vec::value_type; @@ -356,7 +356,7 @@ void rand_trsm_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::transpose } template -void rand_trsm_matrix(fp *M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n, +void rand_trsm_matrix(fp* M, oneapi::mkl::layout layout, oneapi::mkl::transpose trans, int m, int n, int ld) { if (((trans 
== oneapi::mkl::transpose::nontrans) && (layout == oneapi::mkl::layout::col_major)) || @@ -382,7 +382,7 @@ void rand_trsm_matrix(fp *M, oneapi::mkl::layout layout, oneapi::mkl::transpose } template -void rand_tpsv_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, +void rand_tpsv_matrix(vec& M, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, int m) { using fp = typename vec::value_type; std::vector tmp; @@ -408,7 +408,7 @@ void rand_tpsv_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::uplo uppe } template -void rand_tbsv_matrix(vec &M, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, +void rand_tbsv_matrix(vec& M, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans, int m, int k, int ld) { using fp = typename vec::value_type; std::vector tmp; @@ -461,7 +461,7 @@ typename std::enable_if::value, bool>::type check_equal(fp } template -bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag) { +bool check_equal_ptr(sycl::queue queue, fp* x, fp x_ref, int error_mag) { fp x_host; queue.memcpy(&x_host, x, sizeof(fp)).wait(); return check_equal(x_host, x_ref, error_mag); @@ -485,7 +485,7 @@ bool check_equal_trsm(fp x, fp x_ref, int error_mag) { } template -bool check_equal(fp x, fp x_ref, int error_mag, std::ostream &out) { +bool check_equal(fp x, fp x_ref, int error_mag, std::ostream& out) { bool good = check_equal(x, x_ref, error_mag); if (!good) { @@ -495,15 +495,15 @@ bool check_equal(fp x, fp x_ref, int error_mag, std::ostream &out) { } template -bool check_equal_ptr(sycl::queue queue, fp *x, fp x_ref, int error_mag, std::ostream &out) { +bool check_equal_ptr(sycl::queue queue, fp* x, fp x_ref, int error_mag, std::ostream& out) { fp x_host; queue.memcpy(&x_host, x, sizeof(fp)).wait(); return check_equal(x_host, x_ref, error_mag, out); } template -bool check_equal_vector(const fp *v, const fp *v_ref, int n, int inc, int error_mag, - 
std::ostream &out) { +bool check_equal_vector(const fp* v, const fp* v_ref, int n, int inc, int error_mag, + std::ostream& out) { int abs_inc = std::abs(inc), count = 0; bool good = true; @@ -523,7 +523,7 @@ bool check_equal_vector(const fp *v, const fp *v_ref, int n, int inc, int error_ } template -bool check_equal_vector(vec1 &v, vec2 &v_ref, int n, int inc, int error_mag, std::ostream &out) { +bool check_equal_vector(vec1& v, vec2& v_ref, int n, int inc, int error_mag, std::ostream& out) { int abs_inc = std::abs(inc), count = 0; bool good = true; @@ -543,8 +543,8 @@ bool check_equal_vector(vec1 &v, vec2 &v_ref, int n, int inc, int error_mag, std } template -bool check_equal_trsv_vector(vec1 &v, vec2 &v_ref, int n, int inc, int error_mag, - std::ostream &out) { +bool check_equal_trsv_vector(vec1& v, vec2& v_ref, int n, int inc, int error_mag, + std::ostream& out) { int abs_inc = std::abs(inc), count = 0; bool good = true; @@ -564,8 +564,8 @@ bool check_equal_trsv_vector(vec1 &v, vec2 &v_ref, int n, int inc, int error_mag } template -bool check_equal_matrix(acc1 &M, acc2 &M_ref, oneapi::mkl::layout layout, int m, int n, int ld, - int error_mag, std::ostream &out) { +bool check_equal_matrix(acc1& M, acc2& M_ref, oneapi::mkl::layout layout, int m, int n, int ld, + int error_mag, std::ostream& out) { bool good = true; int idx, count = 0; for (int j = 0; j < n; j++) { @@ -586,8 +586,8 @@ bool check_equal_matrix(acc1 &M, acc2 &M_ref, oneapi::mkl::layout layout, int m, } template -bool check_equal_matrix(const fp *M, const fp *M_ref, oneapi::mkl::layout layout, int m, int n, - int ld, int error_mag, std::ostream &out) { +bool check_equal_matrix(const fp* M, const fp* M_ref, oneapi::mkl::layout layout, int m, int n, + int ld, int error_mag, std::ostream& out) { bool good = true; int idx, count = 0; for (int j = 0; j < n; j++) { @@ -608,9 +608,9 @@ bool check_equal_matrix(const fp *M, const fp *M_ref, oneapi::mkl::layout layout } template -bool check_equal_matrix(acc1 &M, 
acc2 &M_ref, oneapi::mkl::layout layout, +bool check_equal_matrix(acc1& M, acc2& M_ref, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int m, int n, int ld, int error_mag, - std::ostream &out) { + std::ostream& out) { bool good = true; int idx, count = 0; for (int j = 0; j < n; j++) { @@ -634,8 +634,8 @@ bool check_equal_matrix(acc1 &M, acc2 &M_ref, oneapi::mkl::layout layout, } template -bool check_equal_trsm_matrix(acc1 &M, acc2 &M_ref, oneapi::mkl::layout layout, int m, int n, int ld, - int error_mag, std::ostream &out) { +bool check_equal_trsm_matrix(acc1& M, acc2& M_ref, oneapi::mkl::layout layout, int m, int n, int ld, + int error_mag, std::ostream& out) { bool good = true; int idx, count = 0; for (int j = 0; j < n; j++) { @@ -677,8 +677,8 @@ typename std::enable_if::value, bool>::type check_almost_eq } template -bool check_almost_equal_matrix_int(Ta &M, Tb &M_ref, oneapi::mkl::layout layout, int m, int n, - int ld, int error_mag, std::ostream &out) { +bool check_almost_equal_matrix_int(Ta& M, Tb& M_ref, oneapi::mkl::layout layout, int m, int n, + int ld, int error_mag, std::ostream& out) { static_assert(is_matrix_type_integral() && is_matrix_type_integral()); bool good = true; int idx, count = 0; @@ -700,8 +700,8 @@ bool check_almost_equal_matrix_int(Ta &M, Tb &M_ref, oneapi::mkl::layout layout, } template -bool check_almost_equal_matrix(Ta &M, Tb &M_ref, oneapi::mkl::layout layout, int m, int n, int ld, - int error_mag, std::ostream &out) { +bool check_almost_equal_matrix(Ta& M, Tb& M_ref, oneapi::mkl::layout layout, int m, int n, int ld, + int error_mag, std::ostream& out) { // Only call if returned dtype is integral if constexpr (is_matrix_type_integral() && is_matrix_type_integral()) return check_almost_equal_matrix_int(M, M_ref, layout, m, n, ld, error_mag, out); diff --git a/tests/unit_tests/blas/level1/axpby.cpp b/tests/unit_tests/blas/level1/axpby.cpp index d43f9beda..4234e5259 100644 --- a/tests/unit_tests/blas/level1/axpby.cpp +++ 
b/tests/unit_tests/blas/level1/axpby.cpp @@ -41,12 +41,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha, fp beta) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha, fp beta) { // Prepare data. vector x, y, y_ref; @@ -58,18 +58,18 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::axpby(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, - (fp_ref *)y_ref.data(), &incy_ref); + ::axpby(&N_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)&beta, + (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ AXPBY. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during AXPBY:\n" << e.what() << std::endl; print_error_code(e); @@ -109,16 +109,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during AXPBY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of AXPBY:\n" << error.what() << std::endl; } @@ -130,8 +130,8 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp return (int)good; } -class 
AxpbyTests - : public ::testing::TestWithParam> {}; +class AxpbyTests : public ::testing::TestWithParam> { +}; TEST_P(AxpbyTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level1/axpby_usm.cpp b/tests/unit_tests/blas/level1/axpby_usm.cpp index ae85ca8f1..1459f1900 100644 --- a/tests/unit_tests/blas/level1/axpby_usm.cpp +++ b/tests/unit_tests/blas/level1/axpby_usm.cpp @@ -41,19 +41,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha, fp beta) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha, fp beta) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during AXPBY:\n" << e.what() << std::endl; print_error_code(e); @@ -79,8 +79,8 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::axpby(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, - (fp_ref *)y_ref.data(), &incy_ref); + ::axpby(&N_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)&beta, + (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ AXPBY. 
@@ -113,16 +113,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during AXPBY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of AXPBY:\n" << error.what() << std::endl; } @@ -134,7 +134,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp } class AxpbyUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(AxpbyUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level1/axpy.cpp b/tests/unit_tests/blas/level1/axpy.cpp index c81f2902d..a0fbdc4c6 100644 --- a/tests/unit_tests/blas/level1/axpy.cpp +++ b/tests/unit_tests/blas/level1/axpy.cpp @@ -41,12 +41,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha) { // Prepare data. vector x, y, y_ref; @@ -58,18 +58,17 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::axpy(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y_ref.data(), - &incy_ref); + ::axpy(&N_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ AXPY. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during AXPY:\n" << e.what() << std::endl; print_error_code(e); @@ -109,16 +108,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during AXPY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of AXPY:\n" << error.what() << std::endl; } @@ -130,7 +129,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp return (int)good; } -class AxpyTests : public ::testing::TestWithParam> { +class AxpyTests : public ::testing::TestWithParam> { }; TEST_P(AxpyTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level1/axpy_usm.cpp b/tests/unit_tests/blas/level1/axpy_usm.cpp index da68f173c..651b70a58 100644 --- a/tests/unit_tests/blas/level1/axpy_usm.cpp +++ b/tests/unit_tests/blas/level1/axpy_usm.cpp @@ -41,19 +41,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp alpha) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during AXPY:\n" << e.what() << std::endl; print_error_code(e); @@ -79,8 +79,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::axpy(&N_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y_ref.data(), - &incy_ref); + ::axpy(&N_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ AXPY. @@ -113,16 +112,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during AXPY:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of AXPY:\n" << error.what() << std::endl; } @@ -134,7 +133,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp } class AxpyUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(AxpyUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level1/dotc.cpp b/tests/unit_tests/blas/level1/dotc.cpp index cb8d0fc37..f420a5e9f 100644 --- a/tests/unit_tests/blas/level1/dotc.cpp +++ b/tests/unit_tests/blas/level1/dotc.cpp @@ -41,12 +41,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { 
template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) { // Prepare data. vector x, y; fp result = 0.0, result_reference = 0.0; @@ -58,18 +58,18 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::dotc((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + ::dotc((fp_ref*)&result_reference, &N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref); // Call DPC++ DOTC. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during DOTC:\n" << e.what() << std::endl; print_error_code(e); @@ -110,16 +110,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during DOTC:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl; } @@ -131,7 +131,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { return (int)good; } -class DotcTests : public ::testing::TestWithParam> { +class DotcTests : public ::testing::TestWithParam> { }; TEST_P(DotcTests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level1/dotc_usm.cpp 
b/tests/unit_tests/blas/level1/dotc_usm.cpp index ad05c9d3b..9c08125f3 100644 --- a/tests/unit_tests/blas/level1/dotc_usm.cpp +++ b/tests/unit_tests/blas/level1/dotc_usm.cpp @@ -41,19 +41,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during DOTC:\n" << e.what() << std::endl; print_error_code(e); @@ -78,12 +78,12 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::dotc((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + ::dotc((fp_ref*)&result_reference, &N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref); // Call DPC++ DOTC. 
- auto result_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + auto result_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); try { #ifdef CALL_RT_API @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during DOTC:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of DOTC:\n" << error.what() << std::endl; } @@ -137,7 +137,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { } class DotcUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(DotcUsmTests, ComplexSinglePrecision) { EXPECT_TRUEORSKIP( diff --git a/tests/unit_tests/blas/level1/dotu.cpp b/tests/unit_tests/blas/level1/dotu.cpp index bbef3ad8c..b6b3dd536 100644 --- a/tests/unit_tests/blas/level1/dotu.cpp +++ b/tests/unit_tests/blas/level1/dotu.cpp @@ -41,12 +41,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) { // Prepare data. 
vector x, y; fp result = 0.0, result_reference = 0.0; @@ -58,18 +58,18 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::dotu((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + ::dotu((fp_ref*)&result_reference, &N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref); // Call DPC++ DOTU. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during DOTU:\n" << e.what() << std::endl; print_error_code(e); @@ -110,16 +110,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during DOTU:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl; } @@ -131,7 +131,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { return (int)good; } -class DotuTests : public ::testing::TestWithParam> { +class DotuTests : public ::testing::TestWithParam> { }; TEST_P(DotuTests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level1/dotu_usm.cpp b/tests/unit_tests/blas/level1/dotu_usm.cpp index 3f30bf5ff..6f7c4a63f 100644 --- a/tests/unit_tests/blas/level1/dotu_usm.cpp +++ b/tests/unit_tests/blas/level1/dotu_usm.cpp @@ -41,19 +41,19 @@ using namespace 
sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during DOTU:\n" << e.what() << std::endl; print_error_code(e); @@ -78,12 +78,12 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::dotu((fp_ref *)&result_reference, &N_ref, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), + ::dotu((fp_ref*)&result_reference, &N_ref, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref); // Call DPC++ DOTU. 
- auto result_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + auto result_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); try { #ifdef CALL_RT_API @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during DOTU:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of DOTU:\n" << error.what() << std::endl; } @@ -136,7 +136,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy) { } class DotuUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(DotuUsmTests, ComplexSinglePrecision) { EXPECT_TRUEORSKIP( diff --git a/tests/unit_tests/blas/level1/rot.cpp b/tests/unit_tests/blas/level1/rot.cpp index f65540182..12a26ce71 100644 --- a/tests/unit_tests/blas/level1/rot.cpp +++ b/tests/unit_tests/blas/level1/rot.cpp @@ -41,12 +41,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_scalar c, +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_scalar c, fp_scalar s) { // Prepare data. 
vector x, x_ref, y, y_ref; @@ -59,18 +59,18 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_ using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::rot(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref, - (fp_scalar *)&c, (fp_scalar *)&s); + ::rot(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref, + (fp_scalar*)&c, (fp_scalar*)&s); // Call DPC++ ROT. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during ROT:\n" << e.what() << std::endl; print_error_code(e); @@ -110,16 +110,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_ } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during ROT:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of ROT:\n" << error.what() << std::endl; } @@ -135,8 +135,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_ return (int)good; } -class RotTests : public ::testing::TestWithParam> { -}; +class RotTests : public ::testing::TestWithParam> {}; TEST_P(RotTests, RealSinglePrecision) { float c(2.0); diff --git a/tests/unit_tests/blas/level1/rot_usm.cpp b/tests/unit_tests/blas/level1/rot_usm.cpp index 287ac285b..6c19b0ceb 100644 --- a/tests/unit_tests/blas/level1/rot_usm.cpp +++ b/tests/unit_tests/blas/level1/rot_usm.cpp @@ -41,20 
+41,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_scalar c, +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_scalar c, fp_scalar s) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during ROT:\n" << e.what() << std::endl; print_error_code(e); @@ -80,8 +80,8 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_ using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::rot(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref, - (fp_scalar *)&c, (fp_scalar *)&s); + ::rot(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref, + (fp_scalar*)&c, (fp_scalar*)&s); // Call DPC++ ROT. 
@@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_ main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during ROT:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of ROT:\n" << error.what() << std::endl; } @@ -137,7 +137,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp_ } class RotUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(RotUsmTests, RealSinglePrecision) { float c(2.0); diff --git a/tests/unit_tests/blas/level1/rotg.cpp b/tests/unit_tests/blas/level1/rotg.cpp index 1a0d569d8..4abcddd39 100644 --- a/tests/unit_tests/blas/level1/rotg.cpp +++ b/tests/unit_tests/blas/level1/rotg.cpp @@ -41,12 +41,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Prepare data. fp a, b, s, a_ref, b_ref, s_ref; fp_scalar c, c_ref; @@ -64,17 +64,17 @@ int test(device *dev, oneapi::mkl::layout layout) { // Call Reference ROTG. using fp_ref = typename ref_type_info::type; - ::rotg((fp_ref *)&a_ref, (fp_ref *)&b_ref, (fp_scalar *)&c_ref, (fp_ref *)&s_ref); + ::rotg((fp_ref*)&a_ref, (fp_ref*)&b_ref, (fp_scalar*)&c_ref, (fp_ref*)&s_ref); // Call DPC++ ROTG. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during ROTG:\n" << e.what() << std::endl; print_error_code(e); @@ -116,16 +116,16 @@ int test(device *dev, oneapi::mkl::layout layout) { } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during ROTG:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl; } @@ -144,7 +144,7 @@ int test(device *dev, oneapi::mkl::layout layout) { return (int)good; } -class RotgTests : public ::testing::TestWithParam> { +class RotgTests : public ::testing::TestWithParam> { }; TEST_P(RotgTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level1/rotg_usm.cpp b/tests/unit_tests/blas/level1/rotg_usm.cpp index de71a793d..d078ff03a 100644 --- a/tests/unit_tests/blas/level1/rotg_usm.cpp +++ b/tests/unit_tests/blas/level1/rotg_usm.cpp @@ -41,19 +41,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during ROTG:\n" << e.what() << std::endl; print_error_code(e); @@ -83,22 +83,22 @@ int test(device *dev, oneapi::mkl::layout layout) { // Call Reference ROTG. using fp_ref = typename ref_type_info::type; - ::rotg((fp_ref *)&a_ref, (fp_ref *)&b_ref, (fp_scalar *)&c_ref, (fp_ref *)&s_ref); + ::rotg((fp_ref*)&a_ref, (fp_ref*)&b_ref, (fp_scalar*)&c_ref, (fp_ref*)&s_ref); // Call DPC++ ROTG. fp *a_p, *b_p, *s_p; - fp_scalar *c_p; + fp_scalar* c_p; if constexpr (alloc_type == usm::alloc::shared) { - a_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); - b_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); - s_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); - c_p = (fp_scalar *)oneapi::mkl::malloc_shared(64, sizeof(fp_scalar), *dev, cxt); + a_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + b_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + s_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + c_p = (fp_scalar*)oneapi::mkl::malloc_shared(64, sizeof(fp_scalar), *dev, cxt); } else if constexpr (alloc_type == usm::alloc::device) { - a_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); - b_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); - s_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); - c_p = (fp_scalar *)oneapi::mkl::malloc_device(64, sizeof(fp_scalar), *dev, cxt); + a_p = (fp*)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); + b_p = (fp*)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); + s_p = (fp*)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); + c_p = (fp_scalar*)oneapi::mkl::malloc_device(64, sizeof(fp_scalar), 
*dev, cxt); } else { throw std::runtime_error("Bad alloc_type"); @@ -139,16 +139,16 @@ int test(device *dev, oneapi::mkl::layout layout) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during ROTG:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of ROTG:\n" << error.what() << std::endl; } @@ -170,7 +170,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class RotgUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(RotgUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP((test(std::get<0>(GetParam()), std::get<1>(GetParam())))); diff --git a/tests/unit_tests/blas/level1/rotm.cpp b/tests/unit_tests/blas/level1/rotm.cpp index ab2c599bf..4e4ba44ec 100644 --- a/tests/unit_tests/blas/level1/rotm.cpp +++ b/tests/unit_tests/blas/level1/rotm.cpp @@ -41,12 +41,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp flag) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp flag) { // Prepare data. vector x, x_ref, y, y_ref; vector param; @@ -61,18 +61,18 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::rotm(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref, - (fp_ref *)param.data()); + ::rotm(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref, + (fp_ref*)param.data()); // Call DPC++ ROTM. 
// Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during ROTM:\n" << e.what() << std::endl; print_error_code(e); @@ -113,16 +113,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during ROTM:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of ROTM:\n" << error.what() << std::endl; } @@ -136,7 +136,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp return (int)good; } -class RotmTests : public ::testing::TestWithParam> { +class RotmTests : public ::testing::TestWithParam> { }; TEST_P(RotmTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level1/rotm_usm.cpp b/tests/unit_tests/blas/level1/rotm_usm.cpp index 7723e096c..79ce634a1 100644 --- a/tests/unit_tests/blas/level1/rotm_usm.cpp +++ b/tests/unit_tests/blas/level1/rotm_usm.cpp @@ -41,19 +41,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp flag) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp flag) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during ROTM:\n" << e.what() << std::endl; print_error_code(e); @@ -81,8 +81,8 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp using fp_ref = typename ref_type_info::type; const int N_ref = N, incx_ref = incx, incy_ref = incy; - ::rotm(&N_ref, (fp_ref *)x_ref.data(), &incx_ref, (fp_ref *)y_ref.data(), &incy_ref, - (fp_ref *)param.data()); + ::rotm(&N_ref, (fp_ref*)x_ref.data(), &incx_ref, (fp_ref*)y_ref.data(), &incy_ref, + (fp_ref*)param.data()); // Call DPC++ ROTM. @@ -115,16 +115,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during ROTM:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of ROTM:\n" << error.what() << std::endl; } @@ -138,7 +138,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, fp } class RotmUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(RotmUsmTests, RealSinglePrecision) { float flag(-1.0); diff --git a/tests/unit_tests/blas/level1/rotmg_usm.cpp b/tests/unit_tests/blas/level1/rotmg_usm.cpp index 92eeee491..0afe7caca 100644 --- a/tests/unit_tests/blas/level1/rotmg_usm.cpp +++ b/tests/unit_tests/blas/level1/rotmg_usm.cpp @@ -41,19 +41,19 @@ using namespace sycl; using std::vector; -extern std::vector devices; 
+extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout) { +int test(device* dev, oneapi::mkl::layout layout) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during ROTMG:\n" << e.what() << std::endl; print_error_code(e); @@ -82,14 +82,14 @@ int test(device *dev, oneapi::mkl::layout layout) { fp *d1_p, *d2_p, *x1_p; if constexpr (alloc_type == usm::alloc::device) { - d1_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); - d2_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); - x1_p = (fp *)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); + d1_p = (fp*)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); + d2_p = (fp*)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); + x1_p = (fp*)oneapi::mkl::malloc_device(64, sizeof(fp), *dev, cxt); } else if constexpr (alloc_type == usm::alloc::shared) { - d1_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); - d2_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); - x1_p = (fp *)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + d1_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + d2_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); + x1_p = (fp*)oneapi::mkl::malloc_shared(64, sizeof(fp), *dev, cxt); } else { throw std::runtime_error("Bad alloc_type"); @@ -101,7 +101,7 @@ int test(device *dev, oneapi::mkl::layout layout) { // Call Reference ROTMG. - ::rotmg(&d1_ref, &d2_ref, &x1_ref, &y1, (fp *)param_ref.data()); + ::rotmg(&d1_ref, &d2_ref, &x1_ref, &y1, (fp*)param_ref.data()); // Call DPC++ ROTMG. 
@@ -134,16 +134,16 @@ int test(device *dev, oneapi::mkl::layout layout) { main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during ROTMG:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of ROTMG:\n" << error.what() << std::endl; } @@ -213,7 +213,7 @@ int test(device *dev, oneapi::mkl::layout layout) { } class RotmgUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(RotmgUsmTests, RealSinglePrecision) { EXPECT_TRUEORSKIP(test(std::get<0>(GetParam()), std::get<1>(GetParam()))); diff --git a/tests/unit_tests/blas/level1/sdsdot.cpp b/tests/unit_tests/blas/level1/sdsdot.cpp index 7293a3699..1030713f0 100644 --- a/tests/unit_tests/blas/level1/sdsdot.cpp +++ b/tests/unit_tests/blas/level1/sdsdot.cpp @@ -41,11 +41,11 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, float alpha) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, float alpha) { // Prepare data. vector x, y; float result = float(-1), result_ref = float(-1); @@ -56,18 +56,18 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, flo // Call Reference SDSDOT. const int N_ref = N, incx_ref = incx, incy_ref = incy; - result_ref = ::sdsdot(&N_ref, (float *)&alpha, (float *)x.data(), &incx_ref, (float *)y.data(), - &incy_ref); + result_ref = + ::sdsdot(&N_ref, (float*)&alpha, (float*)x.data(), &incx_ref, (float*)y.data(), &incy_ref); // Call DPC++ SDSDOT. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SDSDOT:\n" << e.what() << std::endl; print_error_code(e); @@ -108,16 +108,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, flo } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SDSDOT:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl; } @@ -130,7 +130,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, flo } class SdsdotTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SdsdotTests, RealSinglePrecision) { CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam())); diff --git a/tests/unit_tests/blas/level1/sdsdot_usm.cpp b/tests/unit_tests/blas/level1/sdsdot_usm.cpp index a5740516c..ab0221754 100644 --- a/tests/unit_tests/blas/level1/sdsdot_usm.cpp +++ b/tests/unit_tests/blas/level1/sdsdot_usm.cpp @@ -41,18 +41,18 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { -int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, float alpha) { +int test(device* dev, oneapi::mkl::layout layout, int N, int incx, int incy, float alpha) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SDSDOT:\n" << e.what() << std::endl; print_error_code(e); @@ -76,12 +76,12 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, flo // Call Reference SDSDOT. const int N_ref = N, incx_ref = incx, incy_ref = incy; - result_ref = ::sdsdot(&N_ref, (float *)&alpha, (float *)x.data(), &incx_ref, (float *)y.data(), - &incy_ref); + result_ref = + ::sdsdot(&N_ref, (float*)&alpha, (float*)x.data(), &incx_ref, (float*)y.data(), &incy_ref); // Call DPC++ SDSDOT. - auto result_p = (float *)oneapi::mkl::malloc_shared(64, sizeof(float), *dev, cxt); + auto result_p = (float*)oneapi::mkl::malloc_shared(64, sizeof(float), *dev, cxt); try { #ifdef CALL_RT_API @@ -113,16 +113,16 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, flo main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SDSDOT:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SDSDOT:\n" << error.what() << std::endl; } @@ -135,7 +135,7 @@ int test(device *dev, oneapi::mkl::layout layout, int N, int incx, int incy, flo } class SdsdotUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SdsdotUsmTests, RealSinglePrecision) { CHECK_DOUBLE_ON_DEVICE(std::get<0>(GetParam())); diff --git a/tests/unit_tests/blas/level2/gbmv.cpp b/tests/unit_tests/blas/level2/gbmv.cpp index 
94fcbc906..20bc75490 100644 --- a/tests/unit_tests/blas/level2/gbmv.cpp +++ b/tests/unit_tests/blas/level2/gbmv.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, int kl, int ku, fp alpha, fp beta, int incx, int incy, int lda) { // Prepare data. int x_len = outer_dimension(transa, m, n); @@ -66,18 +66,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, using fp_ref = typename ref_type_info::type; ::gbmv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref, &kl_ref, - &ku_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + &ku_ref, (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ GBMV. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GBMV:\n" << e.what() << std::endl; print_error_code(e); @@ -122,16 +122,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GBMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl; } @@ -142,7 +142,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, return (int)good; } -class GbmvTests : public ::testing::TestWithParam> { +class GbmvTests : public ::testing::TestWithParam> { }; TEST_P(GbmvTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/gbmv_usm.cpp b/tests/unit_tests/blas/level2/gbmv_usm.cpp index 9d92fcf7e..ea66daab4 100644 --- a/tests/unit_tests/blas/level2/gbmv_usm.cpp +++ b/tests/unit_tests/blas/level2/gbmv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, int kl, int ku, fp alpha, fp beta, int incx, int incy, int lda) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GBMV:\n" << e.what() << std::endl; print_error_code(e); @@ -86,8 +86,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, using fp_ref = typename ref_type_info::type; ::gbmv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref, &kl_ref, - &ku_ref, (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + &ku_ref, (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ GBMV. @@ -124,16 +124,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GBMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GBMV:\n" << error.what() << std::endl; } @@ -145,7 +145,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, } class GbmvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GbmvUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/gemv.cpp b/tests/unit_tests/blas/level2/gemv.cpp index 3bfff4324..bd15ab54b 100644 --- a/tests/unit_tests/blas/level2/gemv.cpp +++ b/tests/unit_tests/blas/level2/gemv.cpp @@ 
-42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, fp alpha, fp beta, int incx, int incy, int lda) { // Prepare data. int x_len = outer_dimension(transa, m, n); @@ -65,18 +65,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, using fp_ref = typename ref_type_info::type; ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ GEMV. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMV:\n" << e.what() << std::endl; print_error_code(e); @@ -119,16 +119,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl; } @@ -139,7 +139,7 @@ int test(device *dev, 
oneapi::mkl::layout layout, oneapi::mkl::transpose transa, return (int)good; } -class GemvTests : public ::testing::TestWithParam> { +class GemvTests : public ::testing::TestWithParam> { }; TEST_P(GemvTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/gemv_usm.cpp b/tests/unit_tests/blas/level2/gemv_usm.cpp index d1e726e38..a513ab149 100644 --- a/tests/unit_tests/blas/level2/gemv_usm.cpp +++ b/tests/unit_tests/blas/level2/gemv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, int m, int n, fp alpha, fp beta, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GEMV:\n" << e.what() << std::endl; print_error_code(e); @@ -85,8 +85,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, using fp_ref = typename ref_type_info::type; ::gemv(convert_to_cblas_layout(layout), convert_to_cblas_trans(transa), &m_ref, &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ GEMV. 
@@ -123,16 +123,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GEMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GEMV:\n" << error.what() << std::endl; } @@ -144,7 +144,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::transpose transa, } class GemvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GemvUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/ger.cpp b/tests/unit_tests/blas/level2/ger.cpp index 3b32d2827..7610239ad 100644 --- a/tests/unit_tests/blas/level2/ger.cpp +++ b/tests/unit_tests/blas/level2/ger.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, +int test(device* dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. 
@@ -62,18 +62,18 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; using fp_ref = typename ref_type_info::type; - ::ger(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), - &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + ::ger(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), + &incx_ref, (fp_ref*)y.data(), &incy_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ GER. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GER:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GER:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl; } @@ -135,8 +135,7 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in return (int)good; } -class GerTests : public ::testing::TestWithParam> { -}; +class GerTests : public ::testing::TestWithParam> {}; TEST_P(GerTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/ger_usm.cpp b/tests/unit_tests/blas/level2/ger_usm.cpp index 87087f026..c9bece6b8 100644 --- 
a/tests/unit_tests/blas/level2/ger_usm.cpp +++ b/tests/unit_tests/blas/level2/ger_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, +int test(device* dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GER:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; using fp_ref = typename ref_type_info::type; - ::ger(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), - &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + ::ger(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), + &incx_ref, (fp_ref*)y.data(), &incy_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ GER. 
@@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GER:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GER:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in } class GerUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GerUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/gerc.cpp b/tests/unit_tests/blas/level2/gerc.cpp index c19c9f029..e918bbf92 100644 --- a/tests/unit_tests/blas/level2/gerc.cpp +++ b/tests/unit_tests/blas/level2/gerc.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, +int test(device* dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. 
@@ -62,18 +62,18 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; using fp_ref = typename ref_type_info::type; - ::gerc(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), - &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + ::gerc(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), + &incx_ref, (fp_ref*)y.data(), &incy_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ GERC. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GERC:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GERC:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl; } @@ -135,7 +135,7 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in return (int)good; } -class GercTests : public ::testing::TestWithParam> { +class GercTests : public ::testing::TestWithParam> { }; TEST_P(GercTests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/gerc_usm.cpp b/tests/unit_tests/blas/level2/gerc_usm.cpp index b6473484d..c9f04060d 100644 --- 
a/tests/unit_tests/blas/level2/gerc_usm.cpp +++ b/tests/unit_tests/blas/level2/gerc_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, +int test(device* dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GERC:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; using fp_ref = typename ref_type_info::type; - ::gerc(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), - &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + ::gerc(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), + &incx_ref, (fp_ref*)y.data(), &incy_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ GERC. 
@@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GERC:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GERC:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in } class GercUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GercUsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); diff --git a/tests/unit_tests/blas/level2/geru.cpp b/tests/unit_tests/blas/level2/geru.cpp index e0cb7c45d..23af195cf 100644 --- a/tests/unit_tests/blas/level2/geru.cpp +++ b/tests/unit_tests/blas/level2/geru.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, +int test(device* dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. 
@@ -62,18 +62,18 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; using fp_ref = typename ref_type_info::type; - ::geru(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), - &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + ::geru(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), + &incx_ref, (fp_ref*)y.data(), &incy_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ GERU. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GERU:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GERU:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl; } @@ -135,7 +135,7 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in return (int)good; } -class GeruTests : public ::testing::TestWithParam> { +class GeruTests : public ::testing::TestWithParam> { }; TEST_P(GeruTests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/geru_usm.cpp b/tests/unit_tests/blas/level2/geru_usm.cpp index 1e882bd97..31f4e2116 100644 --- 
a/tests/unit_tests/blas/level2/geru_usm.cpp +++ b/tests/unit_tests/blas/level2/geru_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, +int test(device* dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during GERU:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in const int m_ref = m, n_ref = n, incx_ref = incx, incy_ref = incy, lda_ref = lda; using fp_ref = typename ref_type_info::type; - ::geru(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref *)&alpha, (fp_ref *)x.data(), - &incx_ref, (fp_ref *)y.data(), &incy_ref, (fp_ref *)A_ref.data(), &lda_ref); + ::geru(convert_to_cblas_layout(layout), &m_ref, &n_ref, (fp_ref*)&alpha, (fp_ref*)x.data(), + &incx_ref, (fp_ref*)y.data(), &incy_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ GERU. 
@@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during GERU:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of GERU:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, int m, int n, fp alpha, int in } class GeruUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(GeruUsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); diff --git a/tests/unit_tests/blas/level2/hbmv.cpp b/tests/unit_tests/blas/level2/hbmv.cpp index 119aef32a..aa2b51ffa 100644 --- a/tests/unit_tests/blas/level2/hbmv.cpp +++ b/tests/unit_tests/blas/level2/hbmv.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, int incy, int lda) { // Prepare data. 
vector x, y, y_ref, A; @@ -63,18 +63,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ HBMV. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HBMV:\n" << e.what() << std::endl; print_error_code(e); @@ -118,16 +118,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HBMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl; } @@ -138,7 +138,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class HbmvTests : public ::testing::TestWithParam> { +class HbmvTests : public ::testing::TestWithParam> { }; TEST_P(HbmvTests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/hbmv_usm.cpp b/tests/unit_tests/blas/level2/hbmv_usm.cpp index 60305cb93..183dc9e28 100644 --- a/tests/unit_tests/blas/level2/hbmv_usm.cpp +++ 
b/tests/unit_tests/blas/level2/hbmv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HBMV:\n" << e.what() << std::endl; print_error_code(e); @@ -84,8 +84,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ HBMV. 
@@ -122,16 +122,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HBMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HBMV:\n" << error.what() << std::endl; } @@ -143,7 +143,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class HbmvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(HbmvUsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); diff --git a/tests/unit_tests/blas/level2/hemv.cpp b/tests/unit_tests/blas/level2/hemv.cpp index 3636e3774..5e68db394 100644 --- a/tests/unit_tests/blas/level2/hemv.cpp +++ b/tests/unit_tests/blas/level2/hemv.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, int lda) { // Prepare data. 
vector x, y, y_ref, A; @@ -62,18 +62,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hemv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ HEMV. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HEMV:\n" << e.what() << std::endl; print_error_code(e); @@ -116,16 +116,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HEMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl; } @@ -136,7 +136,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class HemvTests : public ::testing::TestWithParam> { +class HemvTests : public ::testing::TestWithParam> { }; TEST_P(HemvTests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/hemv_usm.cpp b/tests/unit_tests/blas/level2/hemv_usm.cpp index a1b8093fc..a5c20b4b9 100644 --- a/tests/unit_tests/blas/level2/hemv_usm.cpp +++ 
b/tests/unit_tests/blas/level2/hemv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HEMV:\n" << e.what() << std::endl; print_error_code(e); @@ -83,8 +83,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hemv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ HEMV. 
@@ -121,16 +121,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HEMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HEMV:\n" << error.what() << std::endl; } @@ -142,7 +142,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class HemvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(HemvUsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); diff --git a/tests/unit_tests/blas/level2/her.cpp b/tests/unit_tests/blas/level2/her.cpp index 46ae9a879..8b0e77cf2 100644 --- a/tests/unit_tests/blas/level2/her.cpp +++ b/tests/unit_tests/blas/level2/her.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) { // Prepare data. vector x, A_ref, A; @@ -61,17 +61,17 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_scalar_mkl = typename ref_type_info::type; ::her(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref); + (fp_scalar_mkl*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ HER. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HER:\n" << e.what() << std::endl; print_error_code(e); @@ -111,16 +111,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HER:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl; } @@ -131,8 +131,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class HerTests : public ::testing::TestWithParam> { -}; +class HerTests : public ::testing::TestWithParam> {}; TEST_P(HerTests, ComplexSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/her2.cpp b/tests/unit_tests/blas/level2/her2.cpp index e98c5cc8b..9da2be96a 100644 --- a/tests/unit_tests/blas/level2/her2.cpp +++ b/tests/unit_tests/blas/level2/her2.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. 
vector x, y, A_ref, A; @@ -62,18 +62,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::her2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data(), &lda_ref); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ HER2. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HER2:\n" << e.what() << std::endl; print_error_code(e); @@ -115,16 +115,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HER2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl; } @@ -135,7 +135,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class Her2Tests : public ::testing::TestWithParam> { +class Her2Tests : public ::testing::TestWithParam> { }; TEST_P(Her2Tests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/her2_usm.cpp b/tests/unit_tests/blas/level2/her2_usm.cpp index c732331ee..6d65f18f4 100644 --- a/tests/unit_tests/blas/level2/her2_usm.cpp +++ b/tests/unit_tests/blas/level2/her2_usm.cpp 
@@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HER2:\n" << e.what() << std::endl; print_error_code(e); @@ -83,8 +83,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::her2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data(), &lda_ref); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ HER2. 
@@ -121,16 +121,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HER2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HER2:\n" << error.what() << std::endl; } @@ -142,7 +142,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class Her2UsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(Her2UsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); diff --git a/tests/unit_tests/blas/level2/her_usm.cpp b/tests/unit_tests/blas/level2/her_usm.cpp index 9e1f5099e..083bd4f28 100644 --- a/tests/unit_tests/blas/level2/her_usm.cpp +++ b/tests/unit_tests/blas/level2/her_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp_scalar alpha, int incx, int lda) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HER:\n" << e.what() << std::endl; print_error_code(e); @@ -82,7 +82,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_scalar_mkl = typename ref_type_info::type; ::her(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref); + (fp_scalar_mkl*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ HER. @@ -116,16 +116,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HER:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HER:\n" << error.what() << std::endl; } @@ -137,7 +137,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class HerUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(HerUsmTests, ComplexSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/hpmv.cpp b/tests/unit_tests/blas/level2/hpmv.cpp index 69e6ea9b2..23f6c4d91 100644 --- a/tests/unit_tests/blas/level2/hpmv.cpp +++ b/tests/unit_tests/blas/level2/hpmv.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector 
devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy) { // Prepare data. vector x, y, y_ref, A; @@ -61,18 +61,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hpmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, - (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), (fp_ref*)x.data(), &incx_ref, (fp_ref*)&beta, + (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ HPMV. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HPMV:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HPMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl; } @@ -134,7 +134,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class HpmvTests : public ::testing::TestWithParam> { +class HpmvTests : public 
::testing::TestWithParam> { }; TEST_P(HpmvTests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/hpmv_usm.cpp b/tests/unit_tests/blas/level2/hpmv_usm.cpp index 743194b18..3766c7e7d 100644 --- a/tests/unit_tests/blas/level2/hpmv_usm.cpp +++ b/tests/unit_tests/blas/level2/hpmv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HPMV:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hpmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, - (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), (fp_ref*)x.data(), &incx_ref, (fp_ref*)&beta, + (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ HPMV. 
@@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HPMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HPMV:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class HpmvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(HpmvUsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); diff --git a/tests/unit_tests/blas/level2/hpr.cpp b/tests/unit_tests/blas/level2/hpr.cpp index b2e5548bd..ca79e335a 100644 --- a/tests/unit_tests/blas/level2/hpr.cpp +++ b/tests/unit_tests/blas/level2/hpr.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp_scalar alpha, int incx) { // Prepare data. vector x, A_ref, A; @@ -61,17 +61,17 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_scalar_mkl = typename ref_type_info::type; ::hpr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data()); + (fp_scalar_mkl*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data()); // Call DPC++ HPR. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HPR:\n" << e.what() << std::endl; print_error_code(e); @@ -111,16 +111,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HPR:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl; } @@ -131,8 +131,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class HprTests : public ::testing::TestWithParam> { -}; +class HprTests : public ::testing::TestWithParam> {}; TEST_P(HprTests, ComplexSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/hpr2.cpp b/tests/unit_tests/blas/level2/hpr2.cpp index e2b19e2fd..22701fe3d 100644 --- a/tests/unit_tests/blas/level2/hpr2.cpp +++ b/tests/unit_tests/blas/level2/hpr2.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { // Prepare data. 
vector x, y, A_ref, A; @@ -61,18 +61,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hpr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data()); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data()); // Call DPC++ HPR2. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HPR2:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HPR2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl; } @@ -134,7 +134,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class Hpr2Tests : public ::testing::TestWithParam> { +class Hpr2Tests : public ::testing::TestWithParam> { }; TEST_P(Hpr2Tests, ComplexSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/hpr2_usm.cpp b/tests/unit_tests/blas/level2/hpr2_usm.cpp index 6dc60dbf6..392f9a74b 100644 --- a/tests/unit_tests/blas/level2/hpr2_usm.cpp +++ b/tests/unit_tests/blas/level2/hpr2_usm.cpp @@ -42,20 +42,20 @@ 
using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HPR2:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::hpr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data()); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data()); // Call DPC++ HPR2. 
@@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HPR2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HPR2:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class Hpr2UsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(Hpr2UsmTests, ComplexSinglePrecision) { std::complex alpha(2.0, -0.5); diff --git a/tests/unit_tests/blas/level2/hpr_usm.cpp b/tests/unit_tests/blas/level2/hpr_usm.cpp index b90b0ee63..708018e6d 100644 --- a/tests/unit_tests/blas/level2/hpr_usm.cpp +++ b/tests/unit_tests/blas/level2/hpr_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp_scalar alpha, int incx) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during HPR:\n" << e.what() << std::endl; print_error_code(e); @@ -82,7 +82,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_scalar_mkl = typename ref_type_info::type; ::hpr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_scalar_mkl *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data()); + (fp_scalar_mkl*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data()); // Call DPC++ HPR. @@ -116,16 +116,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during HPR:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of HPR:\n" << error.what() << std::endl; } @@ -137,7 +137,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class HprUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(HprUsmTests, ComplexSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/sbmv.cpp b/tests/unit_tests/blas/level2/sbmv.cpp index c0347dfda..49df93ad1 100644 --- a/tests/unit_tests/blas/level2/sbmv.cpp +++ b/tests/unit_tests/blas/level2/sbmv.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { 
template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, int incy, int lda) { // Prepare data. vector x, y, y_ref, A; @@ -62,18 +62,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::sbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ SBMV. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SBMV:\n" << e.what() << std::endl; print_error_code(e); @@ -116,16 +116,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SBMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl; } @@ -136,7 +136,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class SbmvTests : public ::testing::TestWithParam> { +class SbmvTests : 
public ::testing::TestWithParam> { }; TEST_P(SbmvTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/sbmv_usm.cpp b/tests/unit_tests/blas/level2/sbmv_usm.cpp index 4fb7d46ad..43093cb24 100644 --- a/tests/unit_tests/blas/level2/sbmv_usm.cpp +++ b/tests/unit_tests/blas/level2/sbmv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, int k, fp alpha, fp beta, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SBMV:\n" << e.what() << std::endl; print_error_code(e); @@ -83,8 +83,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::sbmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, &k_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ SBMV. 
@@ -121,16 +121,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SBMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SBMV:\n" << error.what() << std::endl; } @@ -142,7 +142,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class SbmvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SbmvUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/spmv.cpp b/tests/unit_tests/blas/level2/spmv.cpp index 799e7d775..a2121fbac 100644 --- a/tests/unit_tests/blas/level2/spmv.cpp +++ b/tests/unit_tests/blas/level2/spmv.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy) { // Prepare data. vector x, y, y_ref, A; @@ -61,18 +61,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::spmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, - (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), (fp_ref*)x.data(), &incx_ref, (fp_ref*)&beta, + (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ SPMV. 
// Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SPMV:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SPMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl; } @@ -134,7 +134,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class SpmvTests : public ::testing::TestWithParam> { +class SpmvTests : public ::testing::TestWithParam> { }; TEST_P(SpmvTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/spmv_usm.cpp b/tests/unit_tests/blas/level2/spmv_usm.cpp index ae38ada4a..9dfe57383 100644 --- a/tests/unit_tests/blas/level2/spmv_usm.cpp +++ b/tests/unit_tests/blas/level2/spmv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SPMV:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::spmv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), (fp_ref *)x.data(), &incx_ref, (fp_ref *)&beta, - (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), (fp_ref*)x.data(), &incx_ref, (fp_ref*)&beta, + (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ SPMV. @@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SPMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SPMV:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class SpmvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SpmvUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/spr.cpp b/tests/unit_tests/blas/level2/spr.cpp index 4e4b5d8a9..05b809f45 100644 --- a/tests/unit_tests/blas/level2/spr.cpp +++ b/tests/unit_tests/blas/level2/spr.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; 
-extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx) { // Prepare data. vector x, A_ref, A; @@ -60,17 +60,17 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::spr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data()); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data()); // Call DPC++ SPR. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SPR:\n" << e.what() << std::endl; print_error_code(e); @@ -110,16 +110,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SPR:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl; } @@ -130,8 +130,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class SprTests : public ::testing::TestWithParam> { -}; +class SprTests : public ::testing::TestWithParam> {}; TEST_P(SprTests, RealSinglePrecision) { float alpha(2.0); 
diff --git a/tests/unit_tests/blas/level2/spr2.cpp b/tests/unit_tests/blas/level2/spr2.cpp index d9d00a4e8..bbb232f5c 100644 --- a/tests/unit_tests/blas/level2/spr2.cpp +++ b/tests/unit_tests/blas/level2/spr2.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { // Prepare data. vector x, y, A_ref, A; @@ -61,18 +61,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::spr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data()); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data()); // Call DPC++ SPR2. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SPR2:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SPR2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl; } @@ -134,7 +134,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class Spr2Tests : public ::testing::TestWithParam> { +class Spr2Tests : public ::testing::TestWithParam> { }; TEST_P(Spr2Tests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/spr2_usm.cpp b/tests/unit_tests/blas/level2/spr2_usm.cpp index 683288775..4a029015f 100644 --- a/tests/unit_tests/blas/level2/spr2_usm.cpp +++ b/tests/unit_tests/blas/level2/spr2_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SPR2:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::spr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data()); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data()); // Call DPC++ SPR2. @@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SPR2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SPR2:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class Spr2UsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(Spr2UsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/spr_usm.cpp b/tests/unit_tests/blas/level2/spr_usm.cpp index 3a23a33b4..e81aa41d9 100644 --- a/tests/unit_tests/blas/level2/spr_usm.cpp +++ b/tests/unit_tests/blas/level2/spr_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern 
std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SPR:\n" << e.what() << std::endl; print_error_code(e); @@ -81,7 +81,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::spr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data()); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data()); // Call DPC++ SPR. 
@@ -115,16 +115,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SPR:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SPR:\n" << error.what() << std::endl; } @@ -136,7 +136,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class SprUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SprUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/symv.cpp b/tests/unit_tests/blas/level2/symv.cpp index a22e48ff7..fb33d8914 100644 --- a/tests/unit_tests/blas/level2/symv.cpp +++ b/tests/unit_tests/blas/level2/symv.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, int lda) { // Prepare data. 
vector x, y, y_ref, A; @@ -61,18 +61,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::symv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ SYMV. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYMV:\n" << e.what() << std::endl; print_error_code(e); @@ -115,16 +115,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl; } @@ -135,7 +135,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class SymvTests : public ::testing::TestWithParam> { +class SymvTests : public ::testing::TestWithParam> { }; TEST_P(SymvTests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/symv_usm.cpp b/tests/unit_tests/blas/level2/symv_usm.cpp index f33c0d25f..8cfff4f39 100644 --- a/tests/unit_tests/blas/level2/symv_usm.cpp +++ 
b/tests/unit_tests/blas/level2/symv_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, fp beta, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYMV:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::symv(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)A.data(), &lda_ref, (fp_ref *)x.data(), &incx_ref, - (fp_ref *)&beta, (fp_ref *)y_ref.data(), &incy_ref); + (fp_ref*)&alpha, (fp_ref*)A.data(), &lda_ref, (fp_ref*)x.data(), &incx_ref, + (fp_ref*)&beta, (fp_ref*)y_ref.data(), &incy_ref); // Call DPC++ SYMV. 
@@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYMV:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYMV:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class SymvUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SymvUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/syr.cpp b/tests/unit_tests/blas/level2/syr.cpp index 6b305582b..f382749da 100644 --- a/tests/unit_tests/blas/level2/syr.cpp +++ b/tests/unit_tests/blas/level2/syr.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int lda) { // Prepare data. vector x, A_ref, A; @@ -60,17 +60,17 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::syr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ SYR. // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYR:\n" << e.what() << std::endl; print_error_code(e); @@ -110,16 +110,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYR:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl; } @@ -130,8 +130,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class SyrTests : public ::testing::TestWithParam> { -}; +class SyrTests : public ::testing::TestWithParam> {}; TEST_P(SyrTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/syr2.cpp b/tests/unit_tests/blas/level2/syr2.cpp index 5da1e0106..ef96572e5 100644 --- a/tests/unit_tests/blas/level2/syr2.cpp +++ b/tests/unit_tests/blas/level2/syr2.cpp @@ -42,12 +42,12 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy, int lda) { // Prepare data. 
vector x, y, A_ref, A; @@ -61,18 +61,18 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::syr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data(), &lda_ref); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ SYR2. // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYR2:\n" << e.what() << std::endl; print_error_code(e); @@ -114,16 +114,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYR2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl; } @@ -134,7 +134,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, return (int)good; } -class Syr2Tests : public ::testing::TestWithParam> { +class Syr2Tests : public ::testing::TestWithParam> { }; TEST_P(Syr2Tests, RealSinglePrecision) { diff --git a/tests/unit_tests/blas/level2/syr2_usm.cpp b/tests/unit_tests/blas/level2/syr2_usm.cpp index a1e2cba7d..64db524f6 100644 --- a/tests/unit_tests/blas/level2/syr2_usm.cpp +++ b/tests/unit_tests/blas/level2/syr2_usm.cpp @@ 
-42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int incy, int lda) { // Catch asynchronous exceptions. auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYR2:\n" << e.what() << std::endl; print_error_code(e); @@ -82,8 +82,8 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::syr2(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)y.data(), &incy_ref, - (fp_ref *)A_ref.data(), &lda_ref); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)y.data(), &incy_ref, + (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ SYR2. 
@@ -120,16 +120,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYR2:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYR2:\n" << error.what() << std::endl; } @@ -141,7 +141,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class Syr2UsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(Syr2UsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/blas/level2/syr_usm.cpp b/tests/unit_tests/blas/level2/syr_usm.cpp index 5a9f5034d..c6b652d24 100644 --- a/tests/unit_tests/blas/level2/syr_usm.cpp +++ b/tests/unit_tests/blas/level2/syr_usm.cpp @@ -42,20 +42,20 @@ using namespace sycl; using std::vector; -extern std::vector devices; +extern std::vector devices; namespace { template -int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, +int test(device* dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, int n, fp alpha, int incx, int lda) { // Catch asynchronous exceptions. 
auto exception_handler = [](exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught asynchronous SYCL exception during SYR:\n" << e.what() << std::endl; print_error_code(e); @@ -81,7 +81,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, using fp_ref = typename ref_type_info::type; ::syr(convert_to_cblas_layout(layout), convert_to_cblas_uplo(upper_lower), &n_ref, - (fp_ref *)&alpha, (fp_ref *)x.data(), &incx_ref, (fp_ref *)A_ref.data(), &lda_ref); + (fp_ref*)&alpha, (fp_ref*)x.data(), &incx_ref, (fp_ref*)A_ref.data(), &lda_ref); // Call DPC++ SYR. @@ -115,16 +115,16 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, main_queue.wait(); #endif } - catch (exception const &e) { + catch (exception const& e) { std::cout << "Caught synchronous SYCL exception during SYR:\n" << e.what() << std::endl; print_error_code(e); } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of SYR:\n" << error.what() << std::endl; } @@ -136,7 +136,7 @@ int test(device *dev, oneapi::mkl::layout layout, oneapi::mkl::uplo upper_lower, } class SyrUsmTests - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; TEST_P(SyrUsmTests, RealSinglePrecision) { float alpha(2.0); diff --git a/tests/unit_tests/dft/include/compute_inplace.hpp b/tests/unit_tests/dft/include/compute_inplace.hpp index 9cc161c34..95421a232 100644 --- a/tests/unit_tests/dft/include/compute_inplace.hpp +++ b/tests/unit_tests/dft/include/compute_inplace.hpp @@ -94,9 +94,12 @@ int DFT_Test::test_in_place_buffer() { auto acc_host = 
inout_buf.get_host_access(); auto ptr_host = reinterpret_cast(acc_host.get_pointer()); for (std::int64_t i = 0; i < batches; i++) { - EXPECT_TRUE(check_equal_strided( - ptr_host + backward_distance * i, out_host_ref.data() + ref_distance * i, sizes, - modified_strides_bwd, abs_error_margin, rel_error_margin, std::cout)); + EXPECT_TRUE(check_equal_strided < domain == + oneapi::mkl::dft::domain::REAL > + (ptr_host + backward_distance * i, + out_host_ref.data() + ref_distance * i, sizes, + modified_strides_bwd, abs_error_margin, rel_error_margin, + std::cout)); } } @@ -188,10 +191,11 @@ int DFT_Test::test_in_place_USM() { .wait_and_throw(); for (std::int64_t i = 0; i < batches; i++) { - EXPECT_TRUE(check_equal_strided( - reinterpret_cast(inout.data()) + backward_distance * i, - out_host_ref.data() + ref_distance * i, sizes, modified_strides_bwd, abs_error_margin, - rel_error_margin, std::cout)); + EXPECT_TRUE(check_equal_strided < domain == + oneapi::mkl::dft::domain::REAL > + (reinterpret_cast(inout.data()) + backward_distance * i, + out_host_ref.data() + ref_distance * i, sizes, modified_strides_bwd, + abs_error_margin, rel_error_margin, std::cout)); } sycl::event done = diff --git a/tests/unit_tests/dft/include/compute_out_of_place.hpp b/tests/unit_tests/dft/include/compute_out_of_place.hpp index bcfd09dda..0d2041dc1 100644 --- a/tests/unit_tests/dft/include/compute_out_of_place.hpp +++ b/tests/unit_tests/dft/include/compute_out_of_place.hpp @@ -77,9 +77,11 @@ int DFT_Test::test_out_of_place_buffer() { auto acc_bwd = bwd_buf.get_host_access(); auto bwd_ptr = acc_bwd.get_pointer(); for (std::int64_t i = 0; i < batches; i++) { - EXPECT_TRUE(check_equal_strided( - bwd_ptr + backward_distance * i, out_host_ref.data() + ref_distance * i, sizes, - strides_bwd_cpy, abs_error_margin, rel_error_margin, std::cout)); + EXPECT_TRUE(check_equal_strided < domain == + oneapi::mkl::dft::domain::REAL > + (bwd_ptr + backward_distance * i, + out_host_ref.data() + ref_distance * i, 
sizes, strides_bwd_cpy, + abs_error_margin, rel_error_margin, std::cout)); } } @@ -90,7 +92,7 @@ int DFT_Test::test_out_of_place_buffer() { // account for scaling that occurs during DFT std::for_each(input.begin(), input.end(), - [this](auto &x) { x *= static_cast(forward_elements); }); + [this](auto& x) { x *= static_cast(forward_elements); }); for (std::int64_t i = 0; i < batches; i++) { EXPECT_TRUE(check_equal_strided( @@ -164,9 +166,10 @@ int DFT_Test::test_out_of_place_USM() { auto bwd_ptr = &bwd[0]; for (std::int64_t i = 0; i < batches; i++) { - EXPECT_TRUE(check_equal_strided( - bwd_ptr + backward_distance * i, out_host_ref.data() + ref_distance * i, sizes, - strides_bwd_cpy, abs_error_margin, rel_error_margin, std::cout)); + EXPECT_TRUE(check_equal_strided < domain == + oneapi::mkl::dft::domain::REAL > + (bwd_ptr + backward_distance * i, out_host_ref.data() + ref_distance * i, + sizes, strides_bwd_cpy, abs_error_margin, rel_error_margin, std::cout)); } oneapi::mkl::dft::compute_backward, FwdOutputType, @@ -176,7 +179,7 @@ int DFT_Test::test_out_of_place_USM() { // account for scaling that occurs during DFT std::for_each(input.begin(), input.end(), - [this](auto &x) { x *= static_cast(forward_elements); }); + [this](auto& x) { x *= static_cast(forward_elements); }); for (std::int64_t i = 0; i < batches; i++) { EXPECT_TRUE(check_equal_strided( diff --git a/tests/unit_tests/dft/include/reference_dft.hpp b/tests/unit_tests/dft/include/reference_dft.hpp index 236edc7b0..7114306c6 100644 --- a/tests/unit_tests/dft/include/reference_dft.hpp +++ b/tests/unit_tests/dft/include/reference_dft.hpp @@ -32,7 +32,7 @@ namespace detail { using ref_t = long double; /* Do the calculations using long double */ template -void reference_forward_dft_impl(const TypeIn *in, TypeOut *out, std::size_t N, std::size_t stride) { +void reference_forward_dft_impl(const TypeIn* in, TypeOut* out, std::size_t N, std::size_t stride) { static_assert(is_complex(), "Output type of DFT must be 
complex"); constexpr ref_t TWOPI = 2.0L * 3.141592653589793238462643383279502884197L; @@ -54,14 +54,14 @@ struct reference {}; template struct reference { - static void forward_dft(const std::vector &sizes, const TypeIn *in, TypeOut *out) { + static void forward_dft(const std::vector& sizes, const TypeIn* in, TypeOut* out) { reference_forward_dft_impl(in, out, sizes[0], 1); } }; template struct reference { - static void forward_dft(const std::vector &sizes, const TypeIn *in, TypeOut *out) { + static void forward_dft(const std::vector& sizes, const TypeIn* in, TypeOut* out) { const auto elements = std::accumulate(sizes.begin(), sizes.end(), 1U, std::multiplies<>{}); std::vector> tmp(elements); for (std::size_t i = 0; i < elements; i += sizes[1]) { @@ -75,7 +75,7 @@ struct reference { template struct reference { - static void forward_dft(const std::vector &sizes, const TypeIn *in, TypeOut *out) { + static void forward_dft(const std::vector& sizes, const TypeIn* in, TypeOut* out) { const auto elements = std::accumulate(sizes.begin(), sizes.end(), 1U, std::multiplies<>{}); std::vector> tmp1(elements); std::vector> tmp2(elements); @@ -112,7 +112,7 @@ struct reference { * @param stride the stride between elements in the data set, measured in elements. 
**/ template -void reference_forward_dft(const std::vector &sizes, const TypeIn *in, TypeOut *out) { +void reference_forward_dft(const std::vector& sizes, const TypeIn* in, TypeOut* out) { std::vector unsigned_sizes(sizes.size()); std::transform(sizes.begin(), sizes.end(), unsigned_sizes.begin(), [](std::int64_t size) { return cast_unsigned(size); }); diff --git a/tests/unit_tests/dft/include/test_common.hpp b/tests/unit_tests/dft/include/test_common.hpp index b13723105..4410bdeb2 100644 --- a/tests/unit_tests/dft/include/test_common.hpp +++ b/tests/unit_tests/dft/include/test_common.hpp @@ -58,7 +58,7 @@ inline std::size_t cast_unsigned(std::int64_t i) { } template -bool check_equal(fp x, fp x_ref, double abs_error_mag, double rel_error_mag, std::ostream &out) { +bool check_equal(fp x, fp x_ref, double abs_error_mag, double rel_error_mag, std::ostream& out) { using fp_real = typename complex_info::real_type; static_assert(std::is_floating_point_v, "Expected floating-point real or complex type."); @@ -88,8 +88,8 @@ bool check_equal(fp x, fp x_ref, double abs_error_mag, double rel_error_mag, std } template -bool check_equal_vector(vec1 &&v, vec2 &&v_ref, std::size_t n, double abs_error_mag, - double rel_error_mag, std::ostream &out) { +bool check_equal_vector(vec1&& v, vec2&& v_ref, std::size_t n, double abs_error_mag, + double rel_error_mag, std::ostream& out) { constexpr int max_print = 20; int count = 0; bool good = true; @@ -131,7 +131,7 @@ inline t rand_scalar() { } template -void rand_vector(vec &v, std::size_t n) { +void rand_vector(vec& v, std::size_t n) { using fp = typename vec::value_type; v.resize(n); for (std::size_t i = 0; i < n; i++) { @@ -141,7 +141,7 @@ void rand_vector(vec &v, std::size_t n) { // Catch asynchronous exceptions. 
auto exception_handler = [](sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } @@ -153,7 +153,7 @@ auto exception_handler = [](sycl::exception_list exceptions) { }; template -void commit_descriptor(oneapi::mkl::dft::descriptor &descriptor, +void commit_descriptor(oneapi::mkl::dft::descriptor& descriptor, sycl::queue queue) { #ifdef CALL_RT_API descriptor.commit(queue); @@ -164,7 +164,7 @@ void commit_descriptor(oneapi::mkl::dft::descriptor &descript // is it assumed that the unused elements of the array are ignored inline std::array get_conjugate_even_complex_strides( - const std::vector &sizes) { + const std::vector& sizes) { switch (sizes.size()) { case 1: return { 0, 1 }; case 2: return { 0, sizes[1] / 2 + 1, 1 }; @@ -178,7 +178,7 @@ inline std::array get_conjugate_even_complex_strides( } // is it assumed that the unused elements of the array are ignored -inline std::array get_default_strides(const std::vector &sizes) { +inline std::array get_default_strides(const std::vector& sizes) { if (sizes.size() > 3) { throw oneapi::mkl::unimplemented( "dft/test_common", __FUNCTION__, @@ -207,8 +207,8 @@ T get_default(const std::vector vec, std::size_t idx, T default_) { template std::pair get_default_distances( - const std::vector &sizes, const std::vector &strides_fwd, - const std::vector &strides_bwd) { + const std::vector& sizes, const std::vector& strides_fwd, + const std::vector& strides_bwd) { std::int64_t size0 = sizes[0]; std::int64_t size1 = get_default(sizes, 1, 1l); std::int64_t size2 = get_default(sizes, 2, 1l); @@ -241,8 +241,8 @@ std::pair get_default_distances( //up to 3 dimensions, empty strides = default template > std::vector strided_copy( - const T_vec &contiguous, const std::vector &sizes, - const std::vector &strides, std::int64_t batches, std::int64_t distance, + const T_vec& contiguous, const std::vector& sizes, + const 
std::vector& strides, std::int64_t batches, std::int64_t distance, Allocator alloc = {}) { if (strides.size() == 0) { return { contiguous.begin(), contiguous.end(), alloc }; @@ -273,9 +273,9 @@ std::vector strided_copy( //up to 3 dimensions, empty strides = default template -bool check_equal_strided(const vec1 &v, const vec2 &v_ref, std::vector sizes, +bool check_equal_strided(const vec1& v, const vec2& v_ref, std::vector sizes, std::vector strides, double abs_error_mag, double rel_error_mag, - std::ostream &out) { + std::ostream& out) { if (strides.size() == 0) { std::array strides_arr; if constexpr (ConjugateEvenStrides) { @@ -344,8 +344,7 @@ struct DFTParams { class DFTParamsPrint { public: - std::string operator()( - testing::TestParamInfo> dev) const { + std::string operator()(testing::TestParamInfo> dev) const { auto [device, params] = dev.param; std::string info_name; @@ -377,7 +376,7 @@ class DFTParamsPrint { info_name.append("_batches_").append(std::to_string(params.batches)); std::string dev_name = device->get_info(); - std::for_each(dev_name.begin(), dev_name.end(), [](auto &c) { + std::for_each(dev_name.begin(), dev_name.end(), [](auto& c) { if (!isalnum(c)) c = '_'; }); diff --git a/tests/unit_tests/dft/source/compute_tests.cpp b/tests/unit_tests/dft/source/compute_tests.cpp index 005f833ef..c28c46389 100644 --- a/tests/unit_tests/dft/source/compute_tests.cpp +++ b/tests/unit_tests/dft/source/compute_tests.cpp @@ -35,27 +35,27 @@ #include "compute_out_of_place.hpp" #include "compute_out_of_place_real_real.hpp" -extern std::vector devices; +extern std::vector devices; namespace { class ComputeTests_in_place_COMPLEX - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; class ComputeTests_real_real_in_place_COMPLEX - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; class ComputeTests_out_of_place_COMPLEX - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; class 
ComputeTests_real_real_out_of_place_COMPLEX - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; class ComputeTests_in_place_REAL - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; class ComputeTests_real_real_in_place_REAL - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; class ComputeTests_out_of_place_REAL - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; class ComputeTests_real_real_out_of_place_REAL - : public ::testing::TestWithParam> {}; + : public ::testing::TestWithParam> {}; #define INSTANTIATE_TEST(PRECISION, DOMAIN, PLACE, LAYOUT, STORAGE) \ TEST_P(ComputeTests##_##LAYOUT##PLACE##_##DOMAIN, \ diff --git a/tests/unit_tests/include/test_helper.hpp b/tests/unit_tests/include/test_helper.hpp index 5457079e0..30e623ef1 100644 --- a/tests/unit_tests/include/test_helper.hpp +++ b/tests/unit_tests/include/test_helper.hpp @@ -281,11 +281,11 @@ } \ } while (0); -void print_error_code(sycl::exception const &e); +void print_error_code(sycl::exception const& e); class DeviceNamePrint { public: - std::string operator()(testing::TestParamInfo dev) const { + std::string operator()(testing::TestParamInfo dev) const { std::string dev_name = dev.param->get_info(); for (std::string::size_type i = 0; i < dev_name.size(); ++i) { if (!isalnum(dev_name[i])) @@ -300,7 +300,7 @@ class DeviceNamePrint { class LayoutDeviceNamePrint { public: std::string operator()( - testing::TestParamInfo> dev) const { + testing::TestParamInfo> dev) const { std::string layout_name = std::get<1>(dev.param) == oneapi::mkl::layout::col_major ? 
"Column_Major" : "Row_Major"; std::string dev_name = std::get<0>(dev.param)->get_info(); @@ -318,7 +318,7 @@ class LayoutDeviceNamePrint { namespace oneapi { namespace mkl { -static inline void *aligned_alloc(size_t align, size_t size) { +static inline void* aligned_alloc(size_t align, size_t size) { #ifdef _WIN64 return ::_aligned_malloc(size, align); #else @@ -326,7 +326,7 @@ static inline void *aligned_alloc(size_t align, size_t size) { #endif } -static inline void aligned_free(void *p) { +static inline void aligned_free(void* p) { #ifdef _WIN64 ::_aligned_free(p); #else @@ -335,7 +335,7 @@ static inline void aligned_free(void *p) { } /* Support for Unified Shared Memory allocations for different backends */ -static inline void *malloc_shared(size_t align, size_t size, sycl::device dev, sycl::context ctx) { +static inline void* malloc_shared(size_t align, size_t size, sycl::device dev, sycl::context ctx) { (void)align; #ifdef _WIN64 return sycl::malloc_shared(size, dev, ctx); @@ -349,7 +349,7 @@ static inline void *malloc_shared(size_t align, size_t size, sycl::device dev, s #endif } -static inline void *malloc_device(size_t align, size_t size, sycl::device dev, sycl::context ctx) { +static inline void* malloc_device(size_t align, size_t size, sycl::device dev, sycl::context ctx) { (void)align; #ifdef _WIN64 return sycl::malloc_device(size, dev, ctx); @@ -363,11 +363,11 @@ static inline void *malloc_device(size_t align, size_t size, sycl::device dev, s #endif } -static inline void free_shared(void *p, sycl::context ctx) { +static inline void free_shared(void* p, sycl::context ctx) { sycl::free(p, ctx); } -static inline void free_usm(void *p, sycl::context ctx) { +static inline void free_usm(void* p, sycl::context ctx) { sycl::free(p, ctx); } diff --git a/tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp b/tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp index cb09ec16a..07ce554e8 100644 --- 
a/tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp +++ b/tests/unit_tests/lapack/include/lapack_reference_wrappers.hpp @@ -62,7 +62,7 @@ inline CBLAS_DIAG cblas_diag(oneapi::mkl::diag d) { return CblasUnit; return CblasNonUnit; } -inline CBLAS_SIDE cblas_side(const char *c) { +inline CBLAS_SIDE cblas_side(const char* c) { return *c == 'R' || *c == 'r' ? CblasRight : CblasLeft; } inline CBLAS_SIDE cblas_side(oneapi::mkl::side s) { @@ -150,142 +150,142 @@ inline char to_char(oneapi::mkl::generate v) { } inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n, - int64_t k, float alpha, const float *a, int64_t lda, const float *b, int64_t ldb, - float beta, float *c, int64_t ldc) { + int64_t k, float alpha, const float* a, int64_t lda, const float* b, int64_t ldb, + float beta, float* c, int64_t ldc) { cblas_sgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n, - int64_t k, double alpha, const double *a, int64_t lda, const double *b, - int64_t ldb, double beta, double *c, int64_t ldc) { + int64_t k, double alpha, const double* a, int64_t lda, const double* b, + int64_t ldb, double beta, double* c, int64_t ldc) { cblas_dgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc) { - cblas_cgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, (void *)&alpha, - (void *)a, lda, (void *)(b), ldb, (void *)&beta, (void *)c, ldc); + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, 
std::complex beta, + std::complex* c, int64_t ldc) { + cblas_cgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, (void*)&alpha, + (void*)a, lda, (void*)(b), ldb, (void*)&beta, (void*)c, ldc); } inline void gemm(oneapi::mkl::transpose transa, oneapi::mkl::transpose transb, int64_t m, int64_t n, - int64_t k, std::complex alpha, const std::complex *a, int64_t lda, - const std::complex *b, int64_t ldb, std::complex beta, - std::complex *c, int64_t ldc) { - cblas_zgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, (void *)&alpha, - (void *)a, lda, (void *)(b), ldb, (void *)&beta, (void *)c, ldc); + int64_t k, std::complex alpha, const std::complex* a, int64_t lda, + const std::complex* b, int64_t ldb, std::complex beta, + std::complex* c, int64_t ldc) { + cblas_zgemm(CblasColMajor, cblas_trans(transa), cblas_trans(transb), m, n, k, (void*)&alpha, + (void*)a, lda, (void*)(b), ldb, (void*)&beta, (void*)c, ldc); } -inline int64_t syevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda, - float *w) { +inline int64_t syevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, float* a, int64_t lda, + float* w) { return LAPACKE_ssyevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n, a, lda, w); } -inline int64_t syevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda, - double *w) { +inline int64_t syevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, double* a, int64_t lda, + double* w) { return LAPACKE_dsyevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n, a, lda, w); } -inline int64_t sygvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, float *a, - int64_t lda, float *b, int64_t ldb, float *w) { +inline int64_t sygvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, float* a, + int64_t lda, float* b, int64_t ldb, float* w) { return LAPACKE_ssygvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n, a, lda, b, ldb, w); } -inline int64_t sygvd(int64_t 
itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, double *a, - int64_t lda, double *b, int64_t ldb, double *w) { +inline int64_t sygvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, double* a, + int64_t lda, double* b, int64_t ldb, double* w) { return LAPACKE_dsygvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n, a, lda, b, ldb, w); } inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, float alpha, - const float *a, int64_t lda, float beta, float *c, int64_t ldc) { + const float* a, int64_t lda, float beta, float* c, int64_t ldc) { cblas_ssyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc); } inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, double alpha, - const double *a, int64_t lda, double beta, double *c, int64_t ldc) { + const double* a, int64_t lda, double beta, double* c, int64_t ldc) { cblas_dsyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc); } inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, std::complex *c, int64_t ldc) { - cblas_csyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, (void *)&alpha, a, lda, - (void *)&beta, (void *)c, ldc); + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, std::complex* c, int64_t ldc) { + cblas_csyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, (void*)&alpha, a, lda, + (void*)&beta, (void*)c, ldc); } inline void syrk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, - std::complex alpha, const std::complex *a, int64_t lda, - std::complex beta, std::complex *c, int64_t ldc) { - cblas_zsyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, (void *)&alpha, a, lda, - (void *)&beta, (void *)c, ldc); + std::complex alpha, const std::complex* a, int64_t lda, + std::complex beta, 
std::complex* c, int64_t ldc) { + cblas_zsyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, (void*)&alpha, a, lda, + (void*)&beta, (void*)c, ldc); } inline void herk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, float alpha, - const std::complex *a, int64_t lda, float beta, std::complex *c, + const std::complex* a, int64_t lda, float beta, std::complex* c, int64_t ldc) { - cblas_cherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void *)c, + cblas_cherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void*)c, ldc); } inline void herk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, double alpha, - const std::complex *a, int64_t lda, double beta, std::complex *c, + const std::complex* a, int64_t lda, double beta, std::complex* c, int64_t ldc) { - cblas_zherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void *)c, + cblas_zherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void*)c, ldc); } inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, - float alpha, const float *a, int64_t lda, float beta, float *c, int64_t ldc) { + float alpha, const float* a, int64_t lda, float beta, float* c, int64_t ldc) { cblas_ssyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc); } inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, - double alpha, const double *a, int64_t lda, double beta, double *c, + double alpha, const double* a, int64_t lda, double beta, double* c, int64_t ldc) { cblas_dsyrk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, c, ldc); } inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, - float alpha, const std::complex *a, int64_t lda, float beta, - std::complex *c, int64_t ldc) { - cblas_cherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, 
k, alpha, a, lda, beta, (void *)c, + float alpha, const std::complex* a, int64_t lda, float beta, + std::complex* c, int64_t ldc) { + cblas_cherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void*)c, ldc); } inline void sy_he_rk(oneapi::mkl::uplo u, oneapi::mkl::transpose t, int64_t n, int64_t k, - double alpha, const std::complex *a, int64_t lda, double beta, - std::complex *c, int64_t ldc) { - cblas_zherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void *)c, + double alpha, const std::complex* a, int64_t lda, double beta, + std::complex* c, int64_t ldc) { + cblas_zherk(CblasColMajor, cblas_uplo(u), cblas_trans(t), n, k, alpha, a, lda, beta, (void*)c, ldc); } inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa, - oneapi::mkl::diag diag, int64_t m, int64_t n, float alpha, const float *a, - int64_t lda, float *b, int64_t ldb) { + oneapi::mkl::diag diag, int64_t m, int64_t n, float alpha, const float* a, + int64_t lda, float* b, int64_t ldb) { cblas_strmm(CblasColMajor, cblas_side(side), cblas_uplo(uplo), cblas_trans(transa), cblas_diag(diag), m, n, alpha, a, lda, b, ldb); } inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa, - oneapi::mkl::diag diag, int64_t m, int64_t n, double alpha, const double *a, - int64_t lda, double *b, int64_t ldb) { + oneapi::mkl::diag diag, int64_t m, int64_t n, double alpha, const double* a, + int64_t lda, double* b, int64_t ldb) { cblas_dtrmm(CblasColMajor, cblas_side(side), cblas_uplo(uplo), cblas_trans(transa), cblas_diag(diag), m, n, alpha, a, lda, b, ldb); } inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa, oneapi::mkl::diag diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb) { cblas_ctrmm(CblasColMajor, 
cblas_side(side), cblas_uplo(uplo), cblas_trans(transa), - cblas_diag(diag), m, n, (void *)&alpha, (void *)(a), lda, (void *)(b), ldb); + cblas_diag(diag), m, n, (void*)&alpha, (void*)(a), lda, (void*)(b), ldb); } inline void trmm(oneapi::mkl::side side, oneapi::mkl::uplo uplo, oneapi::mkl::transpose transa, oneapi::mkl::diag diag, int64_t m, int64_t n, std::complex alpha, - const std::complex *a, int64_t lda, std::complex *b, int64_t ldb) { + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb) { cblas_ztrmm(CblasColMajor, cblas_side(side), cblas_uplo(uplo), cblas_trans(transa), - cblas_diag(diag), m, n, (void *)&alpha, (void *)(a), lda, (void *)(b), ldb); + cblas_diag(diag), m, n, (void*)&alpha, (void*)(a), lda, (void*)(b), ldb); } -inline void swap(int64_t n, float *X, int64_t incX, float *Y, int64_t incY) { +inline void swap(int64_t n, float* X, int64_t incX, float* Y, int64_t incY) { cblas_sswap(n, X, incX, Y, incY); } -inline void swap(int64_t n, double *X, int64_t incX, double *Y, int64_t incY) { +inline void swap(int64_t n, double* X, int64_t incX, double* Y, int64_t incY) { cblas_dswap(n, X, incX, Y, incY); } -inline void swap(int64_t n, std::complex *X, int64_t incX, std::complex *Y, +inline void swap(int64_t n, std::complex* X, int64_t incX, std::complex* Y, int64_t incY) { - cblas_cswap(n, (void *)X, incX, (void *)Y, incY); + cblas_cswap(n, (void*)X, incX, (void*)Y, incY); } -inline void swap(int64_t n, std::complex *X, int64_t incX, std::complex *Y, +inline void swap(int64_t n, std::complex* X, int64_t incX, std::complex* Y, int64_t incY) { - cblas_zswap(n, (void *)X, incX, (void *)Y, incY); + cblas_zswap(n, (void*)X, incX, (void*)Y, incY); } template @@ -299,608 +299,607 @@ inline double lamch(char cmach) { return LAPACKE_dlamch(cmach); } -inline float lange(char norm, int64_t m, int64_t n, const std::complex *a, int64_t lda) { +inline float lange(char norm, int64_t m, int64_t n, const std::complex* a, int64_t lda) { return 
LAPACKE_clange(LAPACK_COL_MAJOR, norm, m, n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline double lange(char norm, int64_t m, int64_t n, const double *a, int64_t lda) { +inline double lange(char norm, int64_t m, int64_t n, const double* a, int64_t lda) { return LAPACKE_dlange(LAPACK_COL_MAJOR, norm, m, n, a, lda); } -inline float lange(char norm, int64_t m, int64_t n, const float *a, int64_t lda) { +inline float lange(char norm, int64_t m, int64_t n, const float* a, int64_t lda) { return LAPACKE_slange(LAPACK_COL_MAJOR, norm, m, n, a, lda); } -inline double lange(char norm, int64_t m, int64_t n, const std::complex *a, int64_t lda) { +inline double lange(char norm, int64_t m, int64_t n, const std::complex* a, int64_t lda) { return LAPACKE_zlange(LAPACK_COL_MAJOR, norm, m, n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline float lanhe(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex *a, +inline float lanhe(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex* a, int64_t lda) { return LAPACKE_clanhe(LAPACK_COL_MAJOR, norm, to_char(u), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline double lanhe(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex *a, +inline double lanhe(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex* a, int64_t lda) { return LAPACKE_zlanhe(LAPACK_COL_MAJOR, norm, to_char(u), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline float lansy(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex *a, +inline float lansy(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex* a, int64_t lda) { return LAPACKE_clansy(LAPACK_COL_MAJOR, norm, to_char(u), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline double lansy(char norm, oneapi::mkl::uplo u, int64_t n, const double *a, int64_t lda) { +inline double lansy(char norm, oneapi::mkl::uplo u, int64_t n, const double* a, int64_t lda) { return 
LAPACKE_dlansy(LAPACK_COL_MAJOR, norm, to_char(u), n, a, lda); } -inline float lansy(char norm, oneapi::mkl::uplo u, int64_t n, const float *a, int64_t lda) { +inline float lansy(char norm, oneapi::mkl::uplo u, int64_t n, const float* a, int64_t lda) { return LAPACKE_slansy(LAPACK_COL_MAJOR, norm, to_char(u), n, a, lda); } -inline double lansy(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex *a, +inline double lansy(char norm, oneapi::mkl::uplo u, int64_t n, const std::complex* a, int64_t lda) { return LAPACKE_zlansy(LAPACK_COL_MAJOR, norm, to_char(u), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline int64_t lacpy(char u, int64_t m, int64_t n, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb) { +inline int64_t lacpy(char u, int64_t m, int64_t n, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb) { return LAPACKE_clacpy(LAPACK_COL_MAJOR, u, m, n, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } -inline int64_t lacpy(char u, int64_t m, int64_t n, const double *a, int64_t lda, double *b, +inline int64_t lacpy(char u, int64_t m, int64_t n, const double* a, int64_t lda, double* b, int64_t ldb) { return LAPACKE_dlacpy(LAPACK_COL_MAJOR, u, m, n, a, lda, b, ldb); } -inline int64_t lacpy(char u, int64_t m, int64_t n, const float *a, int64_t lda, float *b, +inline int64_t lacpy(char u, int64_t m, int64_t n, const float* a, int64_t lda, float* b, int64_t ldb) { return LAPACKE_slacpy(LAPACK_COL_MAJOR, u, m, n, a, lda, b, ldb); } -inline int64_t lacpy(char u, int64_t m, int64_t n, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb) { +inline int64_t lacpy(char u, int64_t m, int64_t n, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb) { return LAPACKE_zlacpy(LAPACK_COL_MAJOR, u, m, n, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } -inline 
int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const std::complex *a, - int64_t lda, std::complex *b, int64_t ldb) { +inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const std::complex* a, + int64_t lda, std::complex* b, int64_t ldb) { return LAPACKE_clacpy(LAPACK_COL_MAJOR, to_char(u), m, n, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } -inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const double *a, int64_t lda, - double *b, int64_t ldb) { +inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const double* a, int64_t lda, + double* b, int64_t ldb) { return LAPACKE_dlacpy(LAPACK_COL_MAJOR, to_char(u), m, n, a, lda, b, ldb); } -inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const float *a, int64_t lda, - float *b, int64_t ldb) { +inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const float* a, int64_t lda, + float* b, int64_t ldb) { return LAPACKE_slacpy(LAPACK_COL_MAJOR, to_char(u), m, n, a, lda, b, ldb); } -inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const std::complex *a, - int64_t lda, std::complex *b, int64_t ldb) { +inline int64_t lacpy(oneapi::mkl::uplo u, int64_t m, int64_t n, const std::complex* a, + int64_t lda, std::complex* b, int64_t ldb) { return LAPACKE_zlacpy(LAPACK_COL_MAJOR, to_char(u), m, n, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, std::complex alpha, - std::complex beta, std::complex *a, int64_t lda) { + std::complex beta, std::complex* a, int64_t lda) { return LAPACKE_claset(LAPACK_COL_MAJOR, to_char(u), m, n, - reinterpret_cast(alpha), - reinterpret_cast(beta), - reinterpret_cast(a), lda); + reinterpret_cast(alpha), + reinterpret_cast(beta), + reinterpret_cast(a), lda); } inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, 
double alpha, double beta, - double *a, int64_t lda) { + double* a, int64_t lda) { return LAPACKE_dlaset(LAPACK_COL_MAJOR, to_char(u), m, n, alpha, beta, a, lda); } -inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, float alpha, float beta, float *a, +inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, float alpha, float beta, float* a, int64_t lda) { return LAPACKE_slaset(LAPACK_COL_MAJOR, to_char(u), m, n, alpha, beta, a, lda); } inline int64_t laset(oneapi::mkl::uplo u, int64_t m, int64_t n, std::complex alpha, - std::complex beta, std::complex *a, int64_t lda) { + std::complex beta, std::complex* a, int64_t lda) { return LAPACKE_zlaset(LAPACK_COL_MAJOR, to_char(u), m, n, - reinterpret_cast(alpha), - reinterpret_cast(beta), - reinterpret_cast(a), lda); + reinterpret_cast(alpha), + reinterpret_cast(beta), + reinterpret_cast(a), lda); } inline int64_t laset(char u, int64_t m, int64_t n, std::complex alpha, - std::complex beta, std::complex *a, int64_t lda) { - return LAPACKE_claset(LAPACK_COL_MAJOR, u, m, n, - reinterpret_cast(alpha), - reinterpret_cast(beta), - reinterpret_cast(a), lda); + std::complex beta, std::complex* a, int64_t lda) { + return LAPACKE_claset(LAPACK_COL_MAJOR, u, m, n, reinterpret_cast(alpha), + reinterpret_cast(beta), + reinterpret_cast(a), lda); } -inline int64_t laset(char u, int64_t m, int64_t n, double alpha, double beta, double *a, +inline int64_t laset(char u, int64_t m, int64_t n, double alpha, double beta, double* a, int64_t lda) { return LAPACKE_dlaset(LAPACK_COL_MAJOR, u, m, n, alpha, beta, a, lda); } -inline int64_t laset(char u, int64_t m, int64_t n, float alpha, float beta, float *a, int64_t lda) { +inline int64_t laset(char u, int64_t m, int64_t n, float alpha, float beta, float* a, int64_t lda) { return LAPACKE_slaset(LAPACK_COL_MAJOR, u, m, n, alpha, beta, a, lda); } inline int64_t laset(char u, int64_t m, int64_t n, std::complex alpha, - std::complex beta, std::complex *a, int64_t lda) { + 
std::complex beta, std::complex* a, int64_t lda) { return LAPACKE_zlaset(LAPACK_COL_MAJOR, u, m, n, - reinterpret_cast(alpha), - reinterpret_cast(beta), - reinterpret_cast(a), lda); + reinterpret_cast(alpha), + reinterpret_cast(beta), + reinterpret_cast(a), lda); } -inline int64_t gebrd(int64_t m, int64_t n, std::complex *a, int64_t lda, float *d, float *e, - std::complex *tauq, std::complex *taup) { - return LAPACKE_cgebrd(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - d, e, reinterpret_cast(tauq), - reinterpret_cast(taup)); +inline int64_t gebrd(int64_t m, int64_t n, std::complex* a, int64_t lda, float* d, float* e, + std::complex* tauq, std::complex* taup) { + return LAPACKE_cgebrd(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + d, e, reinterpret_cast(tauq), + reinterpret_cast(taup)); } -inline int64_t gebrd(int64_t m, int64_t n, double *a, int64_t lda, double *d, double *e, - double *tauq, double *taup) { +inline int64_t gebrd(int64_t m, int64_t n, double* a, int64_t lda, double* d, double* e, + double* tauq, double* taup) { return LAPACKE_dgebrd(LAPACK_COL_MAJOR, m, n, a, lda, d, e, tauq, taup); } -inline int64_t gebrd(int64_t m, int64_t n, float *a, int64_t lda, float *d, float *e, float *tauq, - float *taup) { +inline int64_t gebrd(int64_t m, int64_t n, float* a, int64_t lda, float* d, float* e, float* tauq, + float* taup) { return LAPACKE_sgebrd(LAPACK_COL_MAJOR, m, n, a, lda, d, e, tauq, taup); } -inline int64_t gebrd(int64_t m, int64_t n, std::complex *a, int64_t lda, double *d, - double *e, std::complex *tauq, std::complex *taup) { - return LAPACKE_zgebrd(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - d, e, reinterpret_cast(tauq), - reinterpret_cast(taup)); +inline int64_t gebrd(int64_t m, int64_t n, std::complex* a, int64_t lda, double* d, + double* e, std::complex* tauq, std::complex* taup) { + return LAPACKE_zgebrd(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + d, e, reinterpret_cast(tauq), + reinterpret_cast(taup)); } -inline 
int64_t geqrf(int64_t m, int64_t n, std::complex *a, int64_t lda, - std::complex *tau) { - return LAPACKE_cgeqrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - reinterpret_cast(tau)); +inline int64_t geqrf(int64_t m, int64_t n, std::complex* a, int64_t lda, + std::complex* tau) { + return LAPACKE_cgeqrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + reinterpret_cast(tau)); } -inline int64_t geqrf(int64_t m, int64_t n, double *a, int64_t lda, double *tau) { +inline int64_t geqrf(int64_t m, int64_t n, double* a, int64_t lda, double* tau) { return LAPACKE_dgeqrf(LAPACK_COL_MAJOR, m, n, a, lda, tau); } -inline int64_t geqrf(int64_t m, int64_t n, float *a, int64_t lda, float *tau) { +inline int64_t geqrf(int64_t m, int64_t n, float* a, int64_t lda, float* tau) { return LAPACKE_sgeqrf(LAPACK_COL_MAJOR, m, n, a, lda, tau); } -inline int64_t geqrf(int64_t m, int64_t n, std::complex *a, int64_t lda, - std::complex *tau) { - return LAPACKE_zgeqrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - reinterpret_cast(tau)); +inline int64_t geqrf(int64_t m, int64_t n, std::complex* a, int64_t lda, + std::complex* tau) { + return LAPACKE_zgeqrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + reinterpret_cast(tau)); } -inline int64_t gerqf(int64_t m, int64_t n, std::complex *a, int64_t lda, - std::complex *tau) { - return LAPACKE_cgerqf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - reinterpret_cast(tau)); +inline int64_t gerqf(int64_t m, int64_t n, std::complex* a, int64_t lda, + std::complex* tau) { + return LAPACKE_cgerqf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + reinterpret_cast(tau)); } -inline int64_t gerqf(int64_t m, int64_t n, double *a, int64_t lda, double *tau) { +inline int64_t gerqf(int64_t m, int64_t n, double* a, int64_t lda, double* tau) { return LAPACKE_dgerqf(LAPACK_COL_MAJOR, m, n, a, lda, tau); } -inline int64_t gerqf(int64_t m, int64_t n, float *a, int64_t lda, float *tau) { +inline int64_t gerqf(int64_t m, int64_t n, float* a, 
int64_t lda, float* tau) { return LAPACKE_sgerqf(LAPACK_COL_MAJOR, m, n, a, lda, tau); } -inline int64_t gerqf(int64_t m, int64_t n, std::complex *a, int64_t lda, - std::complex *tau) { - return LAPACKE_zgerqf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - reinterpret_cast(tau)); +inline int64_t gerqf(int64_t m, int64_t n, std::complex* a, int64_t lda, + std::complex* tau) { + return LAPACKE_zgerqf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + reinterpret_cast(tau)); } inline int64_t gesvd(oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n, - std::complex *a, int64_t lda, float *s, std::complex *u, - int64_t ldu, std::complex *vt, int64_t ldvt, float *superb) { + std::complex* a, int64_t lda, float* s, std::complex* u, + int64_t ldu, std::complex* vt, int64_t ldvt, float* superb) { return LAPACKE_cgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n, - reinterpret_cast(a), lda, s, - reinterpret_cast(u), ldu, - reinterpret_cast(vt), ldvt, superb); + reinterpret_cast(a), lda, s, + reinterpret_cast(u), ldu, + reinterpret_cast(vt), ldvt, superb); } inline int64_t gesvd(oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n, - double *a, int64_t lda, double *s, double *u, int64_t ldu, double *vt, - int64_t ldvt, double *superb) { + double* a, int64_t lda, double* s, double* u, int64_t ldu, double* vt, + int64_t ldvt, double* superb) { return LAPACKE_dgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n, a, lda, s, u, ldu, vt, ldvt, superb); } inline int64_t gesvd(oneapi::mkl::jobsvd jobu, oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n, - float *a, int64_t lda, float *s, float *u, int64_t ldu, float *vt, - int64_t ldvt, float *superb) { + float* a, int64_t lda, float* s, float* u, int64_t ldu, float* vt, + int64_t ldvt, float* superb) { return LAPACKE_sgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n, a, lda, s, u, ldu, vt, ldvt, superb); } inline int64_t gesvd(oneapi::mkl::jobsvd jobu, 
oneapi::mkl::jobsvd jobvt, int64_t m, int64_t n, - std::complex *a, int64_t lda, double *s, std::complex *u, - int64_t ldu, std::complex *vt, int64_t ldvt, double *superb) { + std::complex* a, int64_t lda, double* s, std::complex* u, + int64_t ldu, std::complex* vt, int64_t ldvt, double* superb) { return LAPACKE_zgesvd(LAPACK_COL_MAJOR, to_char(jobu), to_char(jobvt), m, n, - reinterpret_cast(a), lda, s, - reinterpret_cast(u), ldu, - reinterpret_cast(vt), ldvt, superb); + reinterpret_cast(a), lda, s, + reinterpret_cast(u), ldu, + reinterpret_cast(vt), ldvt, superb); } -inline int64_t getrf(int64_t m, int64_t n, std::complex *a, int64_t lda, int64_t *ipiv) { - return LAPACKE_cgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - reinterpret_cast(ipiv)); +inline int64_t getrf(int64_t m, int64_t n, std::complex* a, int64_t lda, int64_t* ipiv) { + return LAPACKE_cgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + reinterpret_cast(ipiv)); } -inline int64_t getrf(int64_t m, int64_t n, double *a, int64_t lda, int64_t *ipiv) { - return LAPACKE_dgetrf(LAPACK_COL_MAJOR, m, n, a, lda, reinterpret_cast(ipiv)); +inline int64_t getrf(int64_t m, int64_t n, double* a, int64_t lda, int64_t* ipiv) { + return LAPACKE_dgetrf(LAPACK_COL_MAJOR, m, n, a, lda, reinterpret_cast(ipiv)); } -inline int64_t getrf(int64_t m, int64_t n, float *a, int64_t lda, int64_t *ipiv) { - return LAPACKE_sgetrf(LAPACK_COL_MAJOR, m, n, a, lda, reinterpret_cast(ipiv)); +inline int64_t getrf(int64_t m, int64_t n, float* a, int64_t lda, int64_t* ipiv) { + return LAPACKE_sgetrf(LAPACK_COL_MAJOR, m, n, a, lda, reinterpret_cast(ipiv)); } -inline int64_t getrf(int64_t m, int64_t n, std::complex *a, int64_t lda, int64_t *ipiv) { - return LAPACKE_zgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, - reinterpret_cast(ipiv)); +inline int64_t getrf(int64_t m, int64_t n, std::complex* a, int64_t lda, int64_t* ipiv) { + return LAPACKE_zgetrf(LAPACK_COL_MAJOR, m, n, reinterpret_cast(a), lda, + 
reinterpret_cast(ipiv)); } -inline int64_t heevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, std::complex *a, - int64_t lda, float *w) { +inline int64_t heevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, std::complex* a, + int64_t lda, float* w) { return LAPACKE_cheevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n, - reinterpret_cast(a), lda, w); + reinterpret_cast(a), lda, w); } -inline int64_t heevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, std::complex *a, - int64_t lda, double *w) { +inline int64_t heevd(oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, std::complex* a, + int64_t lda, double* w) { return LAPACKE_zheevd(LAPACK_COL_MAJOR, to_char(j), to_char(u), n, - reinterpret_cast(a), lda, w); + reinterpret_cast(a), lda, w); } inline int64_t hegvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, - std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - float *w) { + std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + float* w) { return LAPACKE_chegvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb, w); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb, w); } inline int64_t hegvd(int64_t itype, oneapi::mkl::job j, oneapi::mkl::uplo u, int64_t n, - std::complex *a, int64_t lda, std::complex *b, int64_t ldb, - double *w) { + std::complex* a, int64_t lda, std::complex* b, int64_t ldb, + double* w) { return LAPACKE_zhegvd(LAPACK_COL_MAJOR, itype, to_char(j), to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb, w); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb, w); } -inline int64_t hetrd(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, float *d, - float *e, std::complex *tau) { +inline int64_t hetrd(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, float* d, + float* e, std::complex* tau) { return LAPACKE_chetrd(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, d, e, - 
reinterpret_cast(tau)); + reinterpret_cast(a), lda, d, e, + reinterpret_cast(tau)); } -inline int64_t hetrd(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, - double *d, double *e, std::complex *tau) { +inline int64_t hetrd(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, + double* d, double* e, std::complex* tau) { return LAPACKE_zhetrd(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, d, e, - reinterpret_cast(tau)); + reinterpret_cast(a), lda, d, e, + reinterpret_cast(tau)); } -inline int64_t hetrf(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, - int64_t *ipiv) { +inline int64_t hetrf(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, + int64_t* ipiv) { return LAPACKE_chetrf(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(ipiv)); + reinterpret_cast(a), lda, + reinterpret_cast(ipiv)); } -inline int64_t hetrf(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, - int64_t *ipiv) { +inline int64_t hetrf(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, + int64_t* ipiv) { return LAPACKE_zhetrf(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(ipiv)); + reinterpret_cast(a), lda, + reinterpret_cast(ipiv)); } -inline int64_t ungtr(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, - const std::complex *tau) { +inline int64_t ungtr(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, + const std::complex* tau) { return LAPACKE_cungtr(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(tau)); + reinterpret_cast(a), lda, + reinterpret_cast(tau)); } -inline int64_t ungtr(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, - const std::complex *tau) { +inline int64_t ungtr(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, + const std::complex* tau) { return LAPACKE_zungtr(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(tau)); + 
reinterpret_cast(a), lda, + reinterpret_cast(tau)); } inline int64_t unmtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, const std::complex *a, int64_t lda, - const std::complex *tau, std::complex *c, int64_t ldc) { + int64_t m, int64_t n, const std::complex* a, int64_t lda, + const std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_cunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } inline int64_t unmtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, const std::complex *a, int64_t lda, - const std::complex *tau, std::complex *c, int64_t ldc) { + int64_t m, int64_t n, const std::complex* a, int64_t lda, + const std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_zunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } -inline int64_t orgtr(oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda, const double *tau) { +inline int64_t orgtr(oneapi::mkl::uplo u, int64_t n, double* a, int64_t lda, const double* tau) { return LAPACKE_dorgtr(LAPACK_COL_MAJOR, to_char(u), n, a, lda, tau); } -inline int64_t orgtr(oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda, const float *tau) { +inline int64_t orgtr(oneapi::mkl::uplo u, int64_t n, float* a, int64_t lda, const float* tau) { return LAPACKE_sorgtr(LAPACK_COL_MAJOR, to_char(u), n, a, lda, tau); } inline int64_t ormtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, float *a, int64_t lda, const float *tau, float *c, + int64_t m, int64_t n, float* a, int64_t lda, const float* 
tau, float* c, int64_t ldc) { return LAPACKE_sormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda, tau, c, ldc); } inline int64_t ormtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, double *a, int64_t lda, const double *tau, double *c, + int64_t m, int64_t n, double* a, int64_t lda, const double* tau, double* c, int64_t ldc) { return LAPACKE_dormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda, tau, c, ldc); } inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, float *a, int64_t lda, const float *tau, float *c, + int64_t m, int64_t n, float* a, int64_t lda, const float* tau, float* c, int64_t ldc) { return LAPACKE_sormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda, tau, c, ldc); } inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, double *a, int64_t lda, const double *tau, double *c, + int64_t m, int64_t n, double* a, int64_t lda, const double* tau, double* c, int64_t ldc) { return LAPACKE_dormtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, a, lda, tau, c, ldc); } inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, std::complex *a, int64_t lda, - std::complex *tau, std::complex *c, int64_t ldc) { + int64_t m, int64_t n, std::complex* a, int64_t lda, + std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_cunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } inline int64_t or_un_mtr(oneapi::mkl::side side, oneapi::mkl::uplo u, oneapi::mkl::transpose trans, - int64_t m, int64_t n, std::complex *a, 
int64_t lda, - std::complex *tau, std::complex *c, int64_t ldc) { + int64_t m, int64_t n, std::complex* a, int64_t lda, + std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_zunmtr(LAPACK_COL_MAJOR, to_char(side), to_char(u), to_char(trans), m, n, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } -inline int64_t sytrd(oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda, float *d, float *e, - float *tau) { +inline int64_t sytrd(oneapi::mkl::uplo u, int64_t n, float* a, int64_t lda, float* d, float* e, + float* tau) { return LAPACKE_ssytrd(LAPACK_COL_MAJOR, to_char(u), n, a, lda, d, e, tau); } -inline int64_t sytrd(oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda, double *d, double *e, - double *tau) { +inline int64_t sytrd(oneapi::mkl::uplo u, int64_t n, double* a, int64_t lda, double* d, double* e, + double* tau) { return LAPACKE_dsytrd(LAPACK_COL_MAJOR, to_char(u), n, a, lda, d, e, tau); } -inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, float *a, int64_t lda, int64_t *ipiv) { +inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, float* a, int64_t lda, int64_t* ipiv) { return LAPACKE_ssytrf(LAPACK_COL_MAJOR, to_char(u), n, a, lda, - reinterpret_cast(ipiv)); + reinterpret_cast(ipiv)); } -inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, double *a, int64_t lda, int64_t *ipiv) { +inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, double* a, int64_t lda, int64_t* ipiv) { return LAPACKE_dsytrf(LAPACK_COL_MAJOR, to_char(u), n, a, lda, - reinterpret_cast(ipiv)); + reinterpret_cast(ipiv)); } -inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, - int64_t *ipiv) { +inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, + int64_t* ipiv) { return LAPACKE_csytrf(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(ipiv)); + reinterpret_cast(a), 
lda, + reinterpret_cast(ipiv)); } -inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, std::complex *a, int64_t lda, - int64_t *ipiv) { +inline int64_t sytrf(oneapi::mkl::uplo u, int64_t n, std::complex* a, int64_t lda, + int64_t* ipiv) { return LAPACKE_zsytrf(LAPACK_COL_MAJOR, to_char(u), n, - reinterpret_cast(a), lda, - reinterpret_cast(ipiv)); + reinterpret_cast(a), lda, + reinterpret_cast(ipiv)); } -inline void orgbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, double *a, - int64_t lda, const double *tau) { +inline void orgbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, double* a, + int64_t lda, const double* tau) { LAPACKE_dorgbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k, a, lda, tau); } -inline void orgbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, float *a, - int64_t lda, const float *tau) { +inline void orgbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, float* a, + int64_t lda, const float* tau) { LAPACKE_sorgbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k, a, lda, tau); } -inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, float *a, int64_t lda, const float *tau) { +inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, float* a, int64_t lda, const float* tau) { return LAPACKE_sorgqr(LAPACK_COL_MAJOR, m, n, k, a, lda, tau); } -inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, double *a, int64_t lda, - const double *tau) { +inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, double* a, int64_t lda, + const double* tau) { return LAPACKE_dorgqr(LAPACK_COL_MAJOR, m, n, k, a, lda, tau); } -inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, std::complex *a, int64_t lda, - const std::complex *tau) { - return LAPACKE_cungqr(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast(a), - lda, reinterpret_cast(tau)); +inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, std::complex* a, int64_t lda, + const std::complex* tau) { + return LAPACKE_cungqr(LAPACK_COL_MAJOR, m, n, k, 
reinterpret_cast(a), + lda, reinterpret_cast(tau)); } -inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, std::complex *a, int64_t lda, - const std::complex *tau) { - return LAPACKE_zungqr(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast(a), - lda, reinterpret_cast(tau)); +inline int64_t or_un_gqr(int64_t m, int64_t n, int64_t k, std::complex* a, int64_t lda, + const std::complex* tau) { + return LAPACKE_zungqr(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast(a), + lda, reinterpret_cast(tau)); } inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const float *a, int64_t lda, const float *tau, float *c, + int64_t k, const float* a, int64_t lda, const float* tau, float* c, int64_t ldc) { return LAPACKE_sormqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc); } inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const double *a, int64_t lda, const double *tau, double *c, + int64_t k, const double* a, int64_t lda, const double* tau, double* c, int64_t ldc) { return LAPACKE_dormqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc); } inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const std::complex *a, int64_t lda, - const std::complex *tau, std::complex *c, int64_t ldc) { + int64_t k, const std::complex* a, int64_t lda, + const std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_cunmqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } inline int64_t or_un_mqr(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const std::complex *a, int64_t lda, - const std::complex *tau, std::complex *c, int64_t ldc) { + int64_t k, const std::complex* a, int64_t lda, + 
const std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_zunmqr(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } -inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, float *a, int64_t lda, const float *tau) { +inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, float* a, int64_t lda, const float* tau) { return LAPACKE_sorgrq(LAPACK_COL_MAJOR, m, n, k, a, lda, tau); } -inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, double *a, int64_t lda, - const double *tau) { +inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, double* a, int64_t lda, + const double* tau) { return LAPACKE_dorgrq(LAPACK_COL_MAJOR, m, n, k, a, lda, tau); } -inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, std::complex *a, int64_t lda, - const std::complex *tau) { - return LAPACKE_cungrq(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast(a), - lda, reinterpret_cast(tau)); +inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, std::complex* a, int64_t lda, + const std::complex* tau) { + return LAPACKE_cungrq(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast(a), + lda, reinterpret_cast(tau)); } -inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, std::complex *a, int64_t lda, - const std::complex *tau) { - return LAPACKE_zungrq(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast(a), - lda, reinterpret_cast(tau)); +inline int64_t or_un_grq(int64_t m, int64_t n, int64_t k, std::complex* a, int64_t lda, + const std::complex* tau) { + return LAPACKE_zungrq(LAPACK_COL_MAJOR, m, n, k, reinterpret_cast(a), + lda, reinterpret_cast(tau)); } inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const float *a, int64_t lda, const float *tau, float *c, + int64_t k, const float* a, int64_t lda, const float* tau, float* c, int64_t ldc) { return 
LAPACKE_sormrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc); } inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const double *a, int64_t lda, const double *tau, double *c, + int64_t k, const double* a, int64_t lda, const double* tau, double* c, int64_t ldc) { return LAPACKE_dormrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, a, lda, tau, c, ldc); } inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const std::complex *a, int64_t lda, - const std::complex *tau, std::complex *c, int64_t ldc) { + int64_t k, const std::complex* a, int64_t lda, + const std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_cunmrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } inline int64_t or_un_mrq(oneapi::mkl::side s, oneapi::mkl::transpose t, int64_t m, int64_t n, - int64_t k, const std::complex *a, int64_t lda, - const std::complex *tau, std::complex *c, int64_t ldc) { + int64_t k, const std::complex* a, int64_t lda, + const std::complex* tau, std::complex* c, int64_t ldc) { return LAPACKE_zunmrq(LAPACK_COL_MAJOR, to_char(s), to_char(t), m, n, k, - reinterpret_cast(a), lda, - reinterpret_cast(tau), - reinterpret_cast(c), ldc); + reinterpret_cast(a), lda, + reinterpret_cast(tau), + reinterpret_cast(c), ldc); } -inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, std::complex *a, +inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, std::complex* a, int64_t lda) { return LAPACKE_cpotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, double *a, int64_t lda) { +inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, 
double* a, int64_t lda) { return LAPACKE_dpotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda); } -inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, float *a, int64_t lda) { +inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, float* a, int64_t lda) { return LAPACKE_spotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda); } -inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, std::complex *a, +inline int64_t potrf(oneapi::mkl::uplo upper_lower, int64_t n, std::complex* a, int64_t lda) { return LAPACKE_zpotrf(LAPACK_COL_MAJOR, to_char(upper_lower), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, - const std::complex *a, int64_t lda, std::complex *b, + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb) { return LAPACKE_cpotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } -inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, const double *a, - int64_t lda, double *b, int64_t ldb) { +inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, const double* a, + int64_t lda, double* b, int64_t ldb) { return LAPACKE_dpotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs, a, lda, b, ldb); } -inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, const float *a, - int64_t lda, float *b, int64_t ldb) { +inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, const float* a, + int64_t lda, float* b, int64_t ldb) { return LAPACKE_spotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs, a, lda, b, ldb); } inline int64_t potrs(oneapi::mkl::uplo upper_lower, int64_t n, int64_t nrhs, - const std::complex *a, int64_t lda, std::complex *b, + const std::complex* a, int64_t lda, std::complex* b, int64_t ldb) { return 
LAPACKE_zpotrs(LAPACK_COL_MAJOR, to_char(upper_lower), n, nrhs, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } -inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, std::complex *a, +inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, std::complex* a, int64_t lda) { return LAPACKE_cpotri(LAPACK_COL_MAJOR, to_char(upper_lower), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, double *a, int64_t lda) { +inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, double* a, int64_t lda) { return LAPACKE_dpotri(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda); } -inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, float *a, int64_t lda) { +inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, float* a, int64_t lda) { return LAPACKE_spotri(LAPACK_COL_MAJOR, to_char(upper_lower), n, a, lda); } -inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, std::complex *a, +inline int64_t potri(oneapi::mkl::uplo upper_lower, int64_t n, std::complex* a, int64_t lda) { return LAPACKE_zpotri(LAPACK_COL_MAJOR, to_char(upper_lower), n, - reinterpret_cast(a), lda); + reinterpret_cast(a), lda); } -inline int64_t laswp(int64_t n, std::complex *a, int64_t lda, int64_t k1, int64_t k2, - const int64_t *ipiv, int64_t incx) { - return LAPACKE_claswp(LAPACK_COL_MAJOR, n, reinterpret_cast(a), lda, k1, - k2, reinterpret_cast(ipiv), incx); +inline int64_t laswp(int64_t n, std::complex* a, int64_t lda, int64_t k1, int64_t k2, + const int64_t* ipiv, int64_t incx) { + return LAPACKE_claswp(LAPACK_COL_MAJOR, n, reinterpret_cast(a), lda, k1, + k2, reinterpret_cast(ipiv), incx); } -inline int64_t laswp(int64_t n, double *a, int64_t lda, int64_t k1, int64_t k2, const int64_t *ipiv, +inline int64_t laswp(int64_t n, double* a, int64_t lda, int64_t k1, int64_t k2, const int64_t* ipiv, int64_t incx) { return 
LAPACKE_dlaswp(LAPACK_COL_MAJOR, n, a, lda, k1, k2, - reinterpret_cast(ipiv), incx); + reinterpret_cast(ipiv), incx); } -inline int64_t laswp(int64_t n, float *a, int64_t lda, int64_t k1, int64_t k2, const int64_t *ipiv, +inline int64_t laswp(int64_t n, float* a, int64_t lda, int64_t k1, int64_t k2, const int64_t* ipiv, int64_t incx) { return LAPACKE_slaswp(LAPACK_COL_MAJOR, n, a, lda, k1, k2, - reinterpret_cast(ipiv), incx); + reinterpret_cast(ipiv), incx); } -inline int64_t laswp(int64_t n, std::complex *a, int64_t lda, int64_t k1, int64_t k2, - const int64_t *ipiv, int64_t incx) { - return LAPACKE_zlaswp(LAPACK_COL_MAJOR, n, reinterpret_cast(a), lda, - k1, k2, reinterpret_cast(ipiv), incx); +inline int64_t laswp(int64_t n, std::complex* a, int64_t lda, int64_t k1, int64_t k2, + const int64_t* ipiv, int64_t incx) { + return LAPACKE_zlaswp(LAPACK_COL_MAJOR, n, reinterpret_cast(a), lda, k1, + k2, reinterpret_cast(ipiv), incx); } inline void ungbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, - std::complex *a, int64_t lda, const std::complex *tau) { + std::complex* a, int64_t lda, const std::complex* tau) { LAPACKE_cungbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k, - reinterpret_cast(a), lda, - reinterpret_cast(tau)); + reinterpret_cast(a), lda, + reinterpret_cast(tau)); } inline void ungbr(oneapi::mkl::generate vect, int64_t m, int64_t n, int64_t k, - std::complex *a, int64_t lda, const std::complex *tau) { + std::complex* a, int64_t lda, const std::complex* tau) { LAPACKE_zungbr(LAPACK_COL_MAJOR, to_char(vect), m, n, k, - reinterpret_cast(a), lda, - reinterpret_cast(tau)); + reinterpret_cast(a), lda, + reinterpret_cast(tau)); } inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - int64_t n, int64_t nrhs, const float *a, int64_t lda, float *b, int64_t ldb) { + int64_t n, int64_t nrhs, const float* a, int64_t lda, float* b, int64_t ldb) { return LAPACKE_strtrs(LAPACK_COL_MAJOR, to_char(uplo), 
to_char(trans), to_char(diag), n, nrhs, a, lda, b, ldb); } inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - int64_t n, int64_t nrhs, const double *a, int64_t lda, double *b, + int64_t n, int64_t nrhs, const double* a, int64_t lda, double* b, int64_t ldb) { return LAPACKE_dtrtrs(LAPACK_COL_MAJOR, to_char(uplo), to_char(trans), to_char(diag), n, nrhs, a, lda, b, ldb); } inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - int64_t n, int64_t nrhs, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb) { + int64_t n, int64_t nrhs, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb) { return LAPACKE_ctrtrs(LAPACK_COL_MAJOR, to_char(uplo), to_char(trans), to_char(diag), n, nrhs, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } inline int64_t trtrs(oneapi::mkl::uplo uplo, oneapi::mkl::transpose trans, oneapi::mkl::diag diag, - int64_t n, int64_t nrhs, const std::complex *a, int64_t lda, - std::complex *b, int64_t ldb) { + int64_t n, int64_t nrhs, const std::complex* a, int64_t lda, + std::complex* b, int64_t ldb) { return LAPACKE_ztrtrs(LAPACK_COL_MAJOR, to_char(uplo), to_char(trans), to_char(diag), n, nrhs, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb); + reinterpret_cast(a), lda, + reinterpret_cast(b), ldb); } } //namespace reference diff --git a/tests/unit_tests/main_test.cpp b/tests/unit_tests/main_test.cpp index fc208da09..b8467e96f 100644 --- a/tests/unit_tests/main_test.cpp +++ b/tests/unit_tests/main_test.cpp @@ -133,7 +133,7 @@ int main(int argc, char** argv) { if (dev.is_gpu() && vendor_id == AMD_ID) continue; #endif -// clang-format off + // clang-format off #ifdef __HIPSYCL__ if (dev.is_accelerator()) #else diff --git a/tests/unit_tests/rng/device/include/moments.hpp b/tests/unit_tests/rng/device/include/moments.hpp index 8acf20bf9..51fe22bcb 100644 --- 
a/tests/unit_tests/rng/device/include/moments.hpp +++ b/tests/unit_tests/rng/device/include/moments.hpp @@ -59,9 +59,8 @@ class moments_test { std::is_same_v< Distribution, oneapi::mkl::rng::device::poisson< - std::int32_t, oneapi::mkl::rng::device::poisson_method::devroye>>)&&!queue - .get_device() - .has(sycl::aspect::fp64)) { + std::int32_t, oneapi::mkl::rng::device::poisson_method::devroye>>) && + !queue.get_device().has(sycl::aspect::fp64)) { status = test_skipped; return; } diff --git a/tests/unit_tests/rng/device/include/rng_device_test_common.hpp b/tests/unit_tests/rng/device/include/rng_device_test_common.hpp index 6b014f0ec..aaa2f487a 100644 --- a/tests/unit_tests/rng/device/include/rng_device_test_common.hpp +++ b/tests/unit_tests/rng/device/include/rng_device_test_common.hpp @@ -34,10 +34,9 @@ #define N_GEN_SERVICE (N_ENGINES * N_PORTION) // defines for skip_ahead_ex tests -#define N_SKIP ((std::uint64_t)pow(2, 62)) -#define SKIP_TIMES ((std::int32_t)pow(2, 14)) -#define NUM_TO_SKIP \ - { 0, (std::uint64_t)pow(2, 12) } +#define N_SKIP ((std::uint64_t)pow(2, 62)) +#define SKIP_TIMES ((std::int32_t)pow(2, 14)) +#define NUM_TO_SKIP { 0, (std::uint64_t)pow(2, 12) } // Correctness checking. static inline bool check_equal_device(float x, float x_ref) { diff --git a/tests/unit_tests/rng/include/rng_test_common.hpp b/tests/unit_tests/rng/include/rng_test_common.hpp index d01b04cce..7f4c97e7a 100644 --- a/tests/unit_tests/rng/include/rng_test_common.hpp +++ b/tests/unit_tests/rng/include/rng_test_common.hpp @@ -34,10 +34,9 @@ #define N_GEN_SERVICE (N_ENGINES * N_PORTION) // defines for skip_ahead_ex tests -#define N_SKIP ((std::uint64_t)pow(2, 62)) -#define SKIP_TIMES ((std::int32_t)pow(2, 14)) -#define NUM_TO_SKIP \ - { 0, (std::uint64_t)pow(2, 12) } +#define N_SKIP ((std::uint64_t)pow(2, 62)) +#define SKIP_TIMES ((std::int32_t)pow(2, 14)) +#define NUM_TO_SKIP { 0, (std::uint64_t)pow(2, 12) } // Correctness checking. 
static inline bool check_equal(float x, float x_ref) { diff --git a/tests/unit_tests/sparse_blas/include/common_sparse_reference.hpp b/tests/unit_tests/sparse_blas/include/common_sparse_reference.hpp index 62b213100..675d8930a 100644 --- a/tests/unit_tests/sparse_blas/include/common_sparse_reference.hpp +++ b/tests/unit_tests/sparse_blas/include/common_sparse_reference.hpp @@ -55,9 +55,9 @@ inline T opVal(const T t, const bool isConj) { }; template -void do_csr_transpose(const oneapi::mkl::transpose opA, intType *ia_t, intType *ja_t, fpType *a_t, - intType a_nrows, intType a_ncols, intType indexing, accIntType &ia, - accIntType &ja, accFpType &a, const bool structOnlyFlag = false) { +void do_csr_transpose(const oneapi::mkl::transpose opA, intType* ia_t, intType* ja_t, fpType* a_t, + intType a_nrows, intType a_ncols, intType indexing, accIntType& ia, + accIntType& ja, accFpType& a, const bool structOnlyFlag = false) { const bool isConj = (opA == oneapi::mkl::transpose::conjtrans); // initialize ia_t to zero @@ -105,7 +105,7 @@ void do_csr_transpose(const oneapi::mkl::transpose opA, intType *ia_t, intType * // Transpose the given sparse matrix if needed template -auto sparse_transpose_if_needed(const intType *ia, const intType *ja, const fpType *a, +auto sparse_transpose_if_needed(const intType* ia, const intType* ja, const fpType* a, intType a_nrows, intType a_ncols, std::size_t nnz, intType indexing, oneapi::mkl::transpose transpose_val) { std::vector iopa; @@ -134,7 +134,7 @@ auto sparse_transpose_if_needed(const intType *ia, const intType *ja, const fpTy /// Reduce the leading dimension to the minimum and transpose the matrix if needed /// The outputted matrix always uses row major layout template -auto extract_dense_matrix(const fpType *x, std::size_t nrows, std::size_t ncols, std::size_t ld, +auto extract_dense_matrix(const fpType* x, std::size_t nrows, std::size_t ncols, std::size_t ld, oneapi::mkl::transpose transpose_val, oneapi::mkl::layout 
dense_matrix_layout) { const bool is_row_major = dense_matrix_layout == oneapi::mkl::layout::row_major; @@ -161,8 +161,8 @@ auto extract_dense_matrix(const fpType *x, std::size_t nrows, std::size_t ncols, /// Convert the sparse matrix in the given format to a dense matrix A in row major layout applied with A_view. template -std::vector sparse_to_dense(sparse_matrix_format_t format, const intType *ia, - const intType *ja, const fpType *a, std::size_t a_nrows, +std::vector sparse_to_dense(sparse_matrix_format_t format, const intType* ia, + const intType* ja, const fpType* a, std::size_t a_nrows, std::size_t a_ncols, std::size_t nnz, intType indexing, oneapi::mkl::transpose transpose_val, oneapi::mkl::sparse::matrix_view A_view) { diff --git a/tests/unit_tests/sparse_blas/include/test_common.hpp b/tests/unit_tests/sparse_blas/include/test_common.hpp index 3848c5bf1..6637e0daa 100644 --- a/tests/unit_tests/sparse_blas/include/test_common.hpp +++ b/tests/unit_tests/sparse_blas/include/test_common.hpp @@ -93,16 +93,16 @@ get_all_matrix_properties_combinations(sycl::queue queue, sparse_matrix_format_t return properties_combinations; } -void print_error_code(sycl::exception const &e); +void print_error_code(sycl::exception const& e); // Catch asynchronous exceptions. struct exception_handler_t { void operator()(sycl::exception_list exceptions) { - for (std::exception_ptr const &e : exceptions) { + for (std::exception_ptr const& e : exceptions) { try { std::rethrow_exception(e); } - catch (sycl::exception const &e) { + catch (sycl::exception const& e) { std::cout << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl; print_error_code(e); } @@ -113,7 +113,7 @@ struct exception_handler_t { struct UsmDeleter { sycl::queue q; UsmDeleter(sycl::queue _q) : q(_q) {} - void operator()(void *ptr) { + void operator()(void* ptr) { sycl::free(ptr, q); } }; @@ -126,14 +126,14 @@ auto malloc_device_uptr(sycl::queue q, std::size_t num_elts) { // SYCL buffer creation helper. 
template -sycl::buffer make_buffer(const vec &v) { +sycl::buffer make_buffer(const vec& v) { sycl::buffer buf(v.data(), sycl::range<1>(v.size())); return buf; } template -void copy_host_to_buffer(sycl::queue queue, const std::vector &src, sycl::buffer dst) { - queue.submit([&](sycl::handler &cgh) { +void copy_host_to_buffer(sycl::queue queue, const std::vector& src, sycl::buffer dst) { + queue.submit([&](sycl::handler& cgh) { auto dst_acc = dst.template get_access( cgh, sycl::range<1>(src.size())); cgh.copy(src.data(), dst_acc); @@ -195,7 +195,7 @@ struct rand_scalar> { }; template -void rand_vector(std::vector &v, std::size_t n) { +void rand_vector(std::vector& v, std::size_t n) { using fpRealType = typename complex_info::real_type; v.resize(n); rand_scalar rand; @@ -205,7 +205,7 @@ void rand_vector(std::vector &v, std::size_t n) { } template -void rand_matrix(std::vector &m, oneapi::mkl::layout layout_val, std::size_t nrows, +void rand_matrix(std::vector& m, oneapi::mkl::layout layout_val, std::size_t nrows, std::size_t ncols, std::size_t ld, oneapi::mkl::transpose transpose_val = oneapi::mkl::transpose::nontrans) { using fpRealType = typename complex_info::real_type; @@ -248,8 +248,8 @@ fpType generate_data(bool is_diag) { template intType generate_random_csr_matrix(const intType nrows, const intType ncols, const double density_val, intType indexing, - std::vector &ia, std::vector &ja, - std::vector &a, bool is_symmetric, + std::vector& ia, std::vector& ja, + std::vector& a, bool is_symmetric, bool require_diagonal = false) { intType nnz = 0; rand_scalar rand_density; @@ -299,8 +299,8 @@ intType generate_random_csr_matrix(const intType nrows, const intType ncols, template intType generate_random_coo_matrix(const intType nrows, const intType ncols, const double density_val, intType indexing, - std::vector &ia, std::vector &ja, - std::vector &a, bool is_symmetric, + std::vector& ia, std::vector& ja, + std::vector& a, bool is_symmetric, bool require_diagonal = 
false) { rand_scalar rand_density; @@ -342,8 +342,8 @@ intType generate_random_coo_matrix(const intType nrows, const intType ncols, template intType generate_random_matrix(sparse_matrix_format_t format, const intType nrows, const intType ncols, const double density_val, intType indexing, - std::vector &ia, std::vector &ja, - std::vector &a, bool is_symmetric, + std::vector& ia, std::vector& ja, + std::vector& a, bool is_symmetric, bool require_diagonal = false) { ia.clear(); ja.clear(); @@ -366,8 +366,8 @@ intType generate_random_matrix(sparse_matrix_format_t format, const intType nrow template void shuffle_sparse_matrix_if_needed( sparse_matrix_format_t format, - const std::set &matrix_properties, intType indexing, - intType *ia, intType *ja, fpType *a, intType nnz, std::size_t nrows) { + const std::set& matrix_properties, intType indexing, + intType* ia, intType* ja, fpType* a, intType nnz, std::size_t nrows) { const bool is_sorted = matrix_properties.find(oneapi::mkl::sparse::matrix_property::sorted) != matrix_properties.cend(); if (is_sorted) { @@ -426,8 +426,8 @@ void shuffle_sparse_matrix_if_needed( /// Initialize a sparse matrix specified by the given format template -void init_sparse_matrix(sycl::queue &queue, sparse_matrix_format_t format, - oneapi::mkl::sparse::matrix_handle_t *p_smhandle, std::int64_t num_rows, +void init_sparse_matrix(sycl::queue& queue, sparse_matrix_format_t format, + oneapi::mkl::sparse::matrix_handle_t* p_smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, ContainerIndexT rows, ContainerIndexT cols, ContainerValueT vals) { if (format == sparse_matrix_format_t::CSR) { @@ -446,7 +446,7 @@ void init_sparse_matrix(sycl::queue &queue, sparse_matrix_format_t format, /// Reset the data of a sparse matrix specified by the given format template -void set_matrix_data(sycl::queue &queue, sparse_matrix_format_t format, +void set_matrix_data(sycl::queue& queue, sparse_matrix_format_t format, 
oneapi::mkl::sparse::matrix_handle_t smhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, ContainerIndexT rows, ContainerIndexT cols, ContainerValueT vals) { @@ -465,8 +465,8 @@ void set_matrix_data(sycl::queue &queue, sparse_matrix_format_t format, } template -inline void free_handles(sycl::queue &queue, const std::vector dependencies, - HandlesT &&... handles) { +inline void free_handles(sycl::queue& queue, const std::vector dependencies, + HandlesT&&... handles) { // Fold expression so that handles expands to each value one after the other. ( [&] { @@ -495,19 +495,19 @@ inline void free_handles(sycl::queue &queue, const std::vector depe } template -inline void free_handles(sycl::queue &queue, HandlesT &&... handles) { +inline void free_handles(sycl::queue& queue, HandlesT&&... handles) { free_handles(queue, {}, handles...); } template -inline void wait_and_free_handles(sycl::queue &queue, HandlesT &&... handles) { +inline void wait_and_free_handles(sycl::queue& queue, HandlesT&&... 
handles) { queue.wait(); free_handles(queue, handles...); } inline bool require_square_matrix( oneapi::mkl::sparse::matrix_view A_view, - const std::set &matrix_properties) { + const std::set& matrix_properties) { const bool is_symmetric = matrix_properties.find(oneapi::mkl::sparse::matrix_property::symmetric) != matrix_properties.cend(); @@ -516,7 +516,7 @@ inline bool require_square_matrix( template bool check_equal(fpType x, fpType x_ref, double abs_error_margin, double rel_error_margin, - std::ostream &out) { + std::ostream& out) { using fpRealType = typename complex_info::real_type; static_assert(std::is_floating_point_v, "Expected floating-point real or complex type."); @@ -537,8 +537,8 @@ bool check_equal(fpType x, fpType x_ref, double abs_error_margin, double rel_err } template -bool check_equal_vector(const vecType1 &v, const vecType2 &v_ref, double abs_error_factor = 10.0, - double rel_error_factor = 200.0, std::ostream &out = std::cout) { +bool check_equal_vector(const vecType1& v, const vecType2& v_ref, double abs_error_factor = 10.0, + double rel_error_factor = 200.0, std::ostream& out = std::cout) { using T = typename vecType2::value_type; std::size_t n = v.size(); if (n != v_ref.size()) { @@ -551,7 +551,7 @@ bool check_equal_vector(const vecType1 &v, const vecType2 &v_ref, double abs_err auto max_norm_ref = *std::max_element(std::begin(v_ref), std::end(v_ref), - [](const T &a, const T &b) { return std::abs(a) < std::abs(b); }); + [](const T& a, const T& b) { return std::abs(a) < std::abs(b); }); // Heuristic for the average-case error margins double abs_error_margin = abs_error_factor * std::abs(max_norm_ref) * std::log2(static_cast(n)); diff --git a/tests/unit_tests/sparse_blas/include/test_spmm.hpp b/tests/unit_tests/sparse_blas/include/test_spmm.hpp index 983c0e63c..4573dc72a 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmm.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmm.hpp @@ -53,11 +53,11 @@ */ template void 
test_helper_with_format_with_transpose( - testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device *dev, + testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device* dev, sparse_matrix_format_t format, - const std::vector &non_default_algorithms, - oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, int &num_passed, - int &num_skipped) { + const std::vector& non_default_algorithms, + oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, int& num_passed, + int& num_skipped) { sycl::property_list queue_properties; double density_A_matrix = 0.8; fpType fp_zero = set_fp_value()(0.f, 0.f); @@ -233,10 +233,10 @@ void test_helper_with_format_with_transpose( */ template void test_helper_with_format( - testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device *dev, + testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device* dev, sparse_matrix_format_t format, - const std::vector &non_default_algorithms, int &num_passed, - int &num_skipped) { + const std::vector& non_default_algorithms, int& num_passed, + int& num_skipped) { std::vector transpose_vals{ oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans }; @@ -261,7 +261,7 @@ void test_helper_with_format( */ template void test_helper(testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, - sycl::device *dev, int &num_passed, int &num_skipped) { + sycl::device* dev, int& num_passed, int& num_skipped) { test_helper_with_format( test_functor_i32, test_functor_i64, dev, sparse_matrix_format_t::CSR, { oneapi::mkl::sparse::spmm_alg::no_optimize_alg, oneapi::mkl::sparse::spmm_alg::csr_alg1, @@ -277,14 +277,14 @@ void test_helper(testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i6 /// Compute spmm reference as a dense operation template -void prepare_reference_spmm_data(sparse_matrix_format_t format, const intType *ia, - const intType *ja, 
const fpType *a, intType a_nrows, +void prepare_reference_spmm_data(sparse_matrix_format_t format, const intType* ia, + const intType* ja, const fpType* a, intType a_nrows, intType a_ncols, intType c_ncols, intType a_nnz, intType indexing, oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, fpType alpha, fpType beta, intType ldb, intType ldc, - const fpType *b, oneapi::mkl::sparse::matrix_view A_view, - fpType *c_ref) { + const fpType* b, oneapi::mkl::sparse::matrix_view A_view, + fpType* c_ref) { std::size_t a_nrows_u = static_cast(a_nrows); std::size_t a_ncols_u = static_cast(a_ncols); std::size_t c_ncols_u = static_cast(c_ncols); @@ -316,7 +316,7 @@ void prepare_reference_spmm_data(sparse_matrix_format_t format, const intType *i for (std::size_t i = 0; i < opa_ncols; i++) { acc += dense_opa[row * opa_ncols + i] * dense_opb[i * c_ncols_u + col]; } - fpType &c = c_ref[dense_linear_idx(row, col, ldc_u)]; + fpType& c = c_ref[dense_linear_idx(row, col, ldc_u)]; c = alpha * acc + beta * c; } } diff --git a/tests/unit_tests/sparse_blas/include/test_spmv.hpp b/tests/unit_tests/sparse_blas/include/test_spmv.hpp index e22b5a9e7..15abbf62e 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmv.hpp @@ -52,10 +52,10 @@ */ template void test_helper_with_format_with_transpose( - testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device *dev, + testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device* dev, sparse_matrix_format_t format, - const std::vector &non_default_algorithms, - oneapi::mkl::transpose transpose_val, int &num_passed, int &num_skipped) { + const std::vector& non_default_algorithms, + oneapi::mkl::transpose transpose_val, int& num_passed, int& num_skipped) { sycl::property_list queue_properties; double density_A_matrix = 0.8; fpType fp_zero = set_fp_value()(0.f, 0.f); @@ -226,10 +226,10 @@ void 
test_helper_with_format_with_transpose( */ template void test_helper_with_format( - testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device *dev, + testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, sycl::device* dev, sparse_matrix_format_t format, - const std::vector &non_default_algorithms, int &num_passed, - int &num_skipped) { + const std::vector& non_default_algorithms, int& num_passed, + int& num_skipped) { std::vector transpose_vals{ oneapi::mkl::transpose::nontrans, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::conjtrans }; @@ -252,7 +252,7 @@ void test_helper_with_format( */ template void test_helper(testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, - sycl::device *dev, int &num_passed, int &num_skipped) { + sycl::device* dev, int& num_passed, int& num_skipped) { test_helper_with_format( test_functor_i32, test_functor_i64, dev, sparse_matrix_format_t::CSR, { oneapi::mkl::sparse::spmv_alg::no_optimize_alg, oneapi::mkl::sparse::spmv_alg::csr_alg1, @@ -267,12 +267,12 @@ void test_helper(testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i6 /// Compute spmv reference as a dense operation template -void prepare_reference_spmv_data(sparse_matrix_format_t format, const intType *ia, - const intType *ja, const fpType *a, intType a_nrows, +void prepare_reference_spmv_data(sparse_matrix_format_t format, const intType* ia, + const intType* ja, const fpType* a, intType a_nrows, intType a_ncols, intType a_nnz, intType indexing, oneapi::mkl::transpose opA, fpType alpha, fpType beta, - const fpType *x, oneapi::mkl::sparse::matrix_view A_view, - fpType *y_ref) { + const fpType* x, oneapi::mkl::sparse::matrix_view A_view, + fpType* y_ref) { std::size_t a_nrows_u = static_cast(a_nrows); std::size_t a_ncols_u = static_cast(a_ncols); auto [opa_nrows, opa_ncols] = swap_if_transposed(opA, a_nrows_u, a_ncols_u); diff --git a/tests/unit_tests/sparse_blas/include/test_spsv.hpp 
b/tests/unit_tests/sparse_blas/include/test_spsv.hpp index aabc0f569..fc40f27bb 100644 --- a/tests/unit_tests/sparse_blas/include/test_spsv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spsv.hpp @@ -48,9 +48,9 @@ */ template void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, - sycl::device *dev, sparse_matrix_format_t format, - oneapi::mkl::transpose transpose_val, int &num_passed, - int &num_skipped) { + sycl::device* dev, sparse_matrix_format_t format, + oneapi::mkl::transpose transpose_val, int& num_passed, + int& num_skipped) { sycl::property_list queue_properties; double density_A_matrix = 0.144; fpType alpha = set_fp_value()(1.f, 0.f); @@ -174,8 +174,8 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes */ template void test_helper(testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i64, - sycl::device *dev, oneapi::mkl::transpose transpose_val, int &num_passed, - int &num_skipped) { + sycl::device* dev, oneapi::mkl::transpose transpose_val, int& num_passed, + int& num_skipped) { test_helper_with_format(test_functor_i32, test_functor_i64, dev, sparse_matrix_format_t::CSR, transpose_val, num_passed, num_skipped); @@ -186,11 +186,11 @@ void test_helper(testFunctorI32 test_functor_i32, testFunctorI64 test_functor_i6 /// Compute spsv reference as a dense operation template -void prepare_reference_spsv_data(sparse_matrix_format_t format, const intType *ia, - const intType *ja, const fpType *a, intType m, intType nnz, - intType indexing, oneapi::mkl::transpose opA, const fpType *x, +void prepare_reference_spsv_data(sparse_matrix_format_t format, const intType* ia, + const intType* ja, const fpType* a, intType m, intType nnz, + intType indexing, oneapi::mkl::transpose opA, const fpType* x, fpType alpha, oneapi::mkl::sparse::matrix_view A_view, - fpType *y_ref) { + fpType* y_ref) { std::size_t mu = static_cast(m); auto dense_opa = sparse_to_dense(format, ia, ja, a, mu, mu, 
static_cast(nnz), indexing, opA, A_view); diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp index f76048386..50f0fb2e7 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_buffer.cpp @@ -23,19 +23,19 @@ #include "test_spmm.hpp" -extern std::vector devices; +extern std::vector devices; namespace { template -int test_spmm(sycl::device *dev, sycl::property_list queue_properties, +int test_spmm(sycl::device* dev, sycl::property_list queue_properties, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb, intType ldc, oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::matrix_view A_view, - const std::set &matrix_properties, + const std::set& matrix_properties, bool reset_data, bool test_scalar_on_device) { if (test_scalar_on_device) { // Scalars on the device is not planned to be supported with the buffer API @@ -151,13 +151,13 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, A_view, A_handle, B_handle, &beta, C_handle, alg, descr); } } - catch (const sycl::exception &e) { + catch (const sycl::exception& e) { std::cout << "Caught synchronous SYCL exception during sparse SPMM:\n" << e.what() << std::endl; print_error_code(e); return 0; } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { wait_and_free_handles(main_queue, A_handle, B_handle, C_handle); if (descr) { sycl::event ev_release_descr; @@ -167,7 +167,7 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, } return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error 
raised during execution of sparse SPMM:\n" << error.what() << std::endl; return 0; } @@ -187,7 +187,7 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, return static_cast(valid); } -class SparseSpmmBufferTests : public ::testing::TestWithParam {}; +class SparseSpmmBufferTests : public ::testing::TestWithParam {}; TEST_P(SparseSpmmBufferTests, RealSinglePrecision) { using fpType = float; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp index 9618ef870..1db7c7a25 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmm_usm.cpp @@ -23,19 +23,19 @@ #include "test_spmm.hpp" -extern std::vector devices; +extern std::vector devices; namespace { template -int test_spmm(sycl::device *dev, sycl::property_list queue_properties, +int test_spmm(sycl::device* dev, sycl::property_list queue_properties, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, intType ncols_C, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::layout dense_matrix_layout, oneapi::mkl::transpose transpose_A, oneapi::mkl::transpose transpose_B, fpType alpha, fpType beta, intType ldb, intType ldc, oneapi::mkl::sparse::spmm_alg alg, oneapi::mkl::sparse::matrix_view A_view, - const std::set &matrix_properties, + const std::set& matrix_properties, bool reset_data, bool test_scalar_on_device) { sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); @@ -80,11 +80,11 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, auto alpha_usm_uptr = malloc_device_uptr(main_queue, 1); auto beta_usm_uptr = malloc_device_uptr(main_queue, 1); - intType *ia_usm = ia_usm_uptr.get(); - intType *ja_usm = ja_usm_uptr.get(); - fpType *a_usm = a_usm_uptr.get(); - fpType *b_usm = b_usm_uptr.get(); - fpType *c_usm = c_usm_uptr.get(); + intType* ia_usm = ia_usm_uptr.get(); + intType* ja_usm = 
ja_usm_uptr.get(); + fpType* a_usm = a_usm_uptr.get(); + fpType* b_usm = b_usm_uptr.get(); + fpType* c_usm = c_usm_uptr.get(); std::vector dependencies; // Copy host to device @@ -96,8 +96,8 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, dependencies.push_back(main_queue.memcpy(b_usm, b_host.data(), b_host.size() * sizeof(fpType))); dependencies.push_back(main_queue.memcpy(c_usm, c_host.data(), c_host.size() * sizeof(fpType))); - fpType *alpha_host_or_usm_ptr = &alpha; - fpType *beta_host_or_usm_ptr = &beta; + fpType* alpha_host_or_usm_ptr = &alpha; + fpType* beta_host_or_usm_ptr = &beta; if (test_scalar_on_device) { dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); @@ -188,13 +188,13 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, ev_copy = main_queue.memcpy(c_host.data(), c_usm, c_host.size() * sizeof(fpType), ev_spmm); } - catch (const sycl::exception &e) { + catch (const sycl::exception& e) { std::cout << "Caught synchronous SYCL exception during sparse SPMM:\n" << e.what() << std::endl; print_error_code(e); return 0; } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { wait_and_free_handles(main_queue, A_handle, B_handle, C_handle); if (descr) { sycl::event ev_release_descr; @@ -204,7 +204,7 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, } return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of sparse SPMM:\n" << error.what() << std::endl; return 0; } @@ -227,7 +227,7 @@ int test_spmm(sycl::device *dev, sycl::property_list queue_properties, return static_cast(valid); } -class SparseSpmmUsmTests : public ::testing::TestWithParam {}; +class SparseSpmmUsmTests : public ::testing::TestWithParam {}; TEST_P(SparseSpmmUsmTests,
RealSinglePrecision) { using fpType = float; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp index 20a4b6f16..96328372d 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_buffer.cpp @@ -23,17 +23,17 @@ #include "test_spmv.hpp" -extern std::vector devices; +extern std::vector devices; namespace { template -int test_spmv(sycl::device *dev, sycl::property_list queue_properties, +int test_spmv(sycl::device* dev, sycl::property_list queue_properties, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view, - const std::set &matrix_properties, + const std::set& matrix_properties, bool reset_data, bool test_scalar_on_device) { if (test_scalar_on_device) { // Scalars on the device is not planned to be supported with the buffer API @@ -140,13 +140,13 @@ int test_spmv(sycl::device *dev, sycl::property_list queue_properties, A_handle, x_handle, &beta, y_handle, alg, descr); } } - catch (const sycl::exception &e) { + catch (const sycl::exception& e) { std::cout << "Caught synchronous SYCL exception during sparse SPMV:\n" << e.what() << std::endl; print_error_code(e); return 0; } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { wait_and_free_handles(main_queue, A_handle, x_handle, y_handle); if (descr) { sycl::event ev_release_descr; @@ -156,7 +156,7 @@ int test_spmv(sycl::device *dev, sycl::property_list queue_properties, } return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of sparse SPMV:\n" << error.what() << std::endl; return 0; } @@ -175,7 +175,7 @@ int 
test_spmv(sycl::device *dev, sycl::property_list queue_properties, return static_cast(valid); } -class SparseSpmvBufferTests : public ::testing::TestWithParam {}; +class SparseSpmvBufferTests : public ::testing::TestWithParam {}; TEST_P(SparseSpmvBufferTests, RealSinglePrecision) { using fpType = float; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp index bf5fac9da..c6159aaf4 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spmv_usm.cpp @@ -23,17 +23,17 @@ #include "test_spmv.hpp" -extern std::vector devices; +extern std::vector devices; namespace { template -int test_spmv(sycl::device *dev, sycl::property_list queue_properties, +int test_spmv(sycl::device* dev, sycl::property_list queue_properties, sparse_matrix_format_t format, intType nrows_A, intType ncols_A, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, fpType beta, oneapi::mkl::sparse::spmv_alg alg, oneapi::mkl::sparse::matrix_view A_view, - const std::set &matrix_properties, + const std::set& matrix_properties, bool reset_data, bool test_scalar_on_device) { sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); @@ -73,11 +73,11 @@ int test_spmv(sycl::device *dev, sycl::property_list queue_properties, auto alpha_usm_uptr = malloc_device_uptr(main_queue, 1); auto beta_usm_uptr = malloc_device_uptr(main_queue, 1); - intType *ia_usm = ia_usm_uptr.get(); - intType *ja_usm = ja_usm_uptr.get(); - fpType *a_usm = a_usm_uptr.get(); - fpType *x_usm = x_usm_uptr.get(); - fpType *y_usm = y_usm_uptr.get(); + intType* ia_usm = ia_usm_uptr.get(); + intType* ja_usm = ja_usm_uptr.get(); + fpType* a_usm = a_usm_uptr.get(); + fpType* x_usm = x_usm_uptr.get(); + fpType* y_usm = y_usm_uptr.get(); std::vector dependencies; // Copy host to device @@ -89,8 +89,8 @@ int test_spmv(sycl::device *dev, 
sycl::property_list queue_properties, dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); - fpType *alpha_host_or_usm_ptr = α - fpType *beta_host_or_usm_ptr = β + fpType* alpha_host_or_usm_ptr = α + fpType* beta_host_or_usm_ptr = β if (test_scalar_on_device) { dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); dependencies.push_back(main_queue.memcpy(beta_usm_uptr.get(), &beta, sizeof(fpType))); @@ -181,13 +181,13 @@ int test_spmv(sycl::device *dev, sycl::property_list queue_properties, ev_copy = main_queue.memcpy(y_host.data(), y_usm, y_host.size() * sizeof(fpType), ev_spmv); } - catch (const sycl::exception &e) { + catch (const sycl::exception& e) { std::cout << "Caught synchronous SYCL exception during sparse SPMV:\n" << e.what() << std::endl; print_error_code(e); return 0; } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { wait_and_free_handles(main_queue, A_handle, x_handle, y_handle); if (descr) { sycl::event ev_release_descr; @@ -197,7 +197,7 @@ int test_spmv(sycl::device *dev, sycl::property_list queue_properties, } return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of sparse SPMV:\n" << error.what() << std::endl; return 0; } @@ -219,7 +219,7 @@ int test_spmv(sycl::device *dev, sycl::property_list queue_properties, return static_cast(valid); } -class SparseSpmvUsmTests : public ::testing::TestWithParam {}; +class SparseSpmvUsmTests : public ::testing::TestWithParam {}; TEST_P(SparseSpmvUsmTests, RealSinglePrecision) { using fpType = float; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp index 163285e07..2cf28189f 100644 --- 
a/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_buffer.cpp @@ -23,16 +23,16 @@ #include "test_spsv.hpp" -extern std::vector devices; +extern std::vector devices; namespace { template -int test_spsv(sycl::device *dev, sycl::property_list queue_properties, +int test_spsv(sycl::device* dev, sycl::property_list queue_properties, sparse_matrix_format_t format, intType m, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view, - const std::set &matrix_properties, + const std::set& matrix_properties, bool reset_data, bool test_scalar_on_device) { if (test_scalar_on_device) { // Scalars on the device is not planned to be supported with the buffer API @@ -139,13 +139,13 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, A_handle, x_handle, y_handle, alg, descr); } } - catch (const sycl::exception &e) { + catch (const sycl::exception& e) { std::cout << "Caught synchronous SYCL exception during sparse SPSV:\n" << e.what() << std::endl; print_error_code(e); return 0; } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { wait_and_free_handles(main_queue, A_handle, x_handle, y_handle); if (descr) { sycl::event ev_release_descr; @@ -155,7 +155,7 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, } return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of sparse SPSV:\n" << error.what() << std::endl; return 0; } @@ -175,7 +175,7 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, return static_cast(valid); } -class SparseSpsvBufferTests : public ::testing::TestWithParam {}; +class SparseSpsvBufferTests : public ::testing::TestWithParam {}; TEST_P(SparseSpsvBufferTests, 
RealSinglePrecision) { using fpType = float; diff --git a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp index a5d24829f..e76b568a4 100644 --- a/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp +++ b/tests/unit_tests/sparse_blas/source/sparse_spsv_usm.cpp @@ -23,16 +23,16 @@ #include "test_spsv.hpp" -extern std::vector devices; +extern std::vector devices; namespace { template -int test_spsv(sycl::device *dev, sycl::property_list queue_properties, +int test_spsv(sycl::device* dev, sycl::property_list queue_properties, sparse_matrix_format_t format, intType m, double density_A_matrix, oneapi::mkl::index_base index, oneapi::mkl::transpose transpose_val, fpType alpha, oneapi::mkl::sparse::spsv_alg alg, oneapi::mkl::sparse::matrix_view A_view, - const std::set &matrix_properties, + const std::set& matrix_properties, bool reset_data, bool test_scalar_on_device) { sycl::queue main_queue(*dev, exception_handler_t(), queue_properties); @@ -77,11 +77,11 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, auto y_usm_uptr = malloc_device_uptr(main_queue, y_host.size()); auto alpha_usm_uptr = malloc_device_uptr(main_queue, 1); - intType *ia_usm = ia_usm_uptr.get(); - intType *ja_usm = ja_usm_uptr.get(); - fpType *a_usm = a_usm_uptr.get(); - fpType *x_usm = x_usm_uptr.get(); - fpType *y_usm = y_usm_uptr.get(); + intType* ia_usm = ia_usm_uptr.get(); + intType* ja_usm = ja_usm_uptr.get(); + fpType* a_usm = a_usm_uptr.get(); + fpType* x_usm = x_usm_uptr.get(); + fpType* y_usm = y_usm_uptr.get(); std::vector dependencies; // Copy host to device @@ -93,7 +93,7 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, dependencies.push_back(main_queue.memcpy(x_usm, x_host.data(), x_host.size() * sizeof(fpType))); dependencies.push_back(main_queue.memcpy(y_usm, y_host.data(), y_host.size() * sizeof(fpType))); - fpType *alpha_host_or_usm_ptr = α + fpType* 
alpha_host_or_usm_ptr = α if (test_scalar_on_device) { dependencies.push_back(main_queue.memcpy(alpha_usm_uptr.get(), &alpha, sizeof(fpType))); alpha_host_or_usm_ptr = alpha_usm_uptr.get(); @@ -177,13 +177,13 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, ev_copy = main_queue.memcpy(y_host.data(), y_usm, y_host.size() * sizeof(fpType), ev_spsv); } - catch (const sycl::exception &e) { + catch (const sycl::exception& e) { std::cout << "Caught synchronous SYCL exception during sparse SPSV:\n" << e.what() << std::endl; print_error_code(e); return 0; } - catch (const oneapi::mkl::unimplemented &e) { + catch (const oneapi::mkl::unimplemented& e) { wait_and_free_handles(main_queue, A_handle, x_handle, y_handle); if (descr) { sycl::event ev_release_descr; @@ -193,7 +193,7 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, } return test_skipped; } - catch (const std::runtime_error &error) { + catch (const std::runtime_error& error) { std::cout << "Error raised during execution of sparse SPSV:\n" << error.what() << std::endl; return 0; } @@ -216,7 +216,7 @@ int test_spsv(sycl::device *dev, sycl::property_list queue_properties, return static_cast(valid); } -class SparseSpsvUsmTests : public ::testing::TestWithParam {}; +class SparseSpsvUsmTests : public ::testing::TestWithParam {}; TEST_P(SparseSpsvUsmTests, RealSinglePrecision) { using fpType = float; From 97eef5c0edd9eb15ecb6a2d108f8b718f83cd7d1 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 25 Oct 2024 12:42:37 +0200 Subject: [PATCH 39/43] Move more functions to the detail namespace --- .../cusparse/onemkl_sparse_blas_cusparse.hpp | 2 - .../backends/cusparse/cusparse_error.hpp | 26 +-- .../cusparse/cusparse_global_handle.hpp | 4 +- .../backends/cusparse/cusparse_handles.cpp | 155 ++++++------- .../backends/cusparse/cusparse_handles.hpp | 6 +- .../backends/cusparse/cusparse_helper.hpp | 21 +- .../cusparse/cusparse_scope_handle.cpp | 4 +- 
.../cusparse/cusparse_scope_handle.hpp | 4 +- .../backends/cusparse/cusparse_task.hpp | 30 +-- .../cusparse/operations/cusparse_spmm.cpp | 209 +++++++++--------- .../cusparse/operations/cusparse_spmv.cpp | 209 +++++++++--------- .../cusparse/operations/cusparse_spsv.cpp | 177 ++++++++------- 12 files changed, 436 insertions(+), 411 deletions(-) diff --git a/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp index 6de2802f1..c8e816eeb 100644 --- a/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp +++ b/include/oneapi/mkl/sparse_blas/detail/cusparse/onemkl_sparse_blas_cusparse.hpp @@ -26,8 +26,6 @@ namespace oneapi::mkl::sparse::cusparse { -namespace detail = oneapi::mkl::sparse::detail; - #include "oneapi/mkl/sparse_blas/detail/onemkl_sparse_blas_backends.hxx" } // namespace oneapi::mkl::sparse::cusparse diff --git a/src/sparse_blas/backends/cusparse/cusparse_error.hpp b/src/sparse_blas/backends/cusparse/cusparse_error.hpp index 8d2f66c8f..738888576 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_error.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_error.hpp @@ -27,7 +27,7 @@ #include "oneapi/mkl/exceptions.hpp" -namespace oneapi::mkl::sparse::cusparse { +namespace oneapi::mkl::sparse::cusparse::detail { inline std::string cuda_result_to_str(CUresult result) { switch (result) { @@ -44,13 +44,13 @@ inline std::string cuda_result_to_str(CUresult result) { } } -#define CUDA_ERROR_FUNC(func, ...) \ - do { \ - auto res = func(__VA_ARGS__); \ - if (res != CUDA_SUCCESS) { \ - throw oneapi::mkl::exception("sparse_blas", #func, \ - "cuda error: " + cuda_result_to_str(res)); \ - } \ +#define CUDA_ERROR_FUNC(func, ...) 
\ + do { \ + auto res = func(__VA_ARGS__); \ + if (res != CUDA_SUCCESS) { \ + throw oneapi::mkl::exception("sparse_blas", #func, \ + "cuda error: " + detail::cuda_result_to_str(res)); \ + } \ } while (0) inline std::string cusparse_status_to_str(cusparseStatus_t status) { @@ -92,12 +92,12 @@ inline void check_status(cusparseStatus_t status, const std::string& function, } } -#define CUSPARSE_ERR_FUNC(func, ...) \ - do { \ - auto status = func(__VA_ARGS__); \ - check_status(status, #func); \ +#define CUSPARSE_ERR_FUNC(func, ...) \ + do { \ + auto status = func(__VA_ARGS__); \ + detail::check_status(status, #func); \ } while (0) -} // namespace oneapi::mkl::sparse::cusparse +} // namespace oneapi::mkl::sparse::cusparse::detail #endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_ERROR_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp index b77db5529..179b007f5 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_global_handle.hpp @@ -29,7 +29,7 @@ #include #include -namespace oneapi::mkl::sparse::cusparse { +namespace oneapi::mkl::sparse::cusparse::detail { template struct cusparse_global_handle { @@ -58,6 +58,6 @@ struct cusparse_global_handle { } }; -} // namespace oneapi::mkl::sparse::cusparse +} // namespace oneapi::mkl::sparse::cusparse::detail #endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_GLOBAL_HANDLE_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp index 920e32a21..ff3d8fcae 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.cpp @@ -37,12 +37,12 @@ void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, st sycl::buffer val) { auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); - 
submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_value_type = CudaEnumType::value; + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType::value; cusparseDnVecDescr_t cu_dvhandle; - CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, get_mem(ih, acc), + CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, detail::get_mem(ih, acc), cuda_value_type); *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); }); @@ -54,10 +54,10 @@ template void init_dense_vector(sycl::queue& queue, dense_vector_handle_t* p_dvhandle, std::int64_t size, fpType* val) { auto event = queue.submit([&](sycl::handler& cgh) { - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_value_type = CudaEnumType::value; + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType::value; cusparseDnVecDescr_t cu_dvhandle; CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &cu_dvhandle, size, val, cuda_value_type); *p_dvhandle = new dense_vector_handle(cu_dvhandle, val, size); @@ -72,17 +72,17 @@ void set_dense_vector_data(sycl::queue& queue, dense_vector_handle_t dvhandle, s detail::check_can_reset_value_handle(__func__, dvhandle, true); auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (dvhandle->size != size) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); - auto cuda_value_type = CudaEnumType::value; + auto cuda_value_type = detail::CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, - get_mem(ih, acc), cuda_value_type); + detail::get_mem(ih, acc), cuda_value_type); dvhandle->size = size; } else { CUSPARSE_ERR_FUNC(cusparseDnVecSetValues, dvhandle->backend_handle, - get_mem(ih, acc)); + detail::get_mem(ih, acc)); } dvhandle->set_buffer(val); }); @@ -96,7 +96,7 @@ void set_dense_vector_data(sycl::queue&, dense_vector_handle_t dvhandle, std::in detail::check_can_reset_value_handle(__func__, dvhandle, false); if (dvhandle->size != size) { CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); - auto cuda_value_type = CudaEnumType::value; + auto cuda_value_type = detail::CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateDnVec, &dvhandle->backend_handle, size, val, cuda_value_type); dvhandle->size = size; @@ -116,7 +116,7 @@ sycl::event release_dense_vector(sycl::queue& queue, dense_vector_handle_t dvhan CUSPARSE_ERR_FUNC(cusparseDestroyDnVec, dvhandle->backend_handle); delete dvhandle; 
}; - return dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dvhandle); + return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dvhandle); } // Dense matrix @@ -126,14 +126,14 @@ void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, st sycl::buffer val) { auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. - CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_value_type = CudaEnumType::value; - auto cuda_order = get_cuda_order(dense_layout); + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType::value; + auto cuda_order = detail::get_cuda_order(dense_layout); cusparseDnMatDescr_t cu_dmhandle; CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, - get_mem(ih, acc), cuda_value_type, cuda_order); + detail::get_mem(ih, acc), cuda_value_type, cuda_order); *p_dmhandle = new dense_matrix_handle(cu_dmhandle, val, num_rows, num_cols, ld, dense_layout); }); @@ -145,11 +145,11 @@ template void init_dense_matrix(sycl::queue& queue, dense_matrix_handle_t* p_dmhandle, std::int64_t num_rows, std::int64_t num_cols, std::int64_t ld, layout dense_layout, fpType* val) { auto event = queue.submit([&](sycl::handler& cgh) { - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_value_type = CudaEnumType::value; - auto cuda_order = get_cuda_order(dense_layout); + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_value_type = detail::CudaEnumType::value; + auto cuda_order = detail::get_cuda_order(dense_layout); cusparseDnMatDescr_t cu_dmhandle; CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &cu_dmhandle, num_rows, num_cols, ld, val, cuda_value_type, cuda_order); @@ -167,14 +167,15 @@ void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, detail::check_can_reset_value_handle(__func__, dmhandle, true); auto event = queue.submit([&](sycl::handler& cgh) { auto acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (dmhandle->num_rows != num_rows || dmhandle->num_cols != num_cols || dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); - auto cuda_value_type = CudaEnumType::value; - auto cuda_order = get_cuda_order(dense_layout); + auto cuda_value_type = detail::CudaEnumType::value; + auto cuda_order = detail::get_cuda_order(dense_layout); CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, - num_cols, ld, get_mem(ih, acc), cuda_value_type, cuda_order); + num_cols, ld, detail::get_mem(ih, acc), cuda_value_type, + cuda_order); dmhandle->num_rows = num_rows; dmhandle->num_cols = num_cols; dmhandle->ld = ld; @@ -182,7 +183,7 @@ void set_dense_matrix_data(sycl::queue& queue, dense_matrix_handle_t dmhandle, } else { CUSPARSE_ERR_FUNC(cusparseDnMatSetValues, dmhandle->backend_handle, - get_mem(ih, acc)); + detail::get_mem(ih, acc)); } dmhandle->set_buffer(val); }); @@ -198,8 +199,8 @@ void set_dense_matrix_data(sycl::queue&, dense_matrix_handle_t dmhandle, std::in if (dmhandle->num_rows != num_rows || dmhandle->num_cols != 
num_cols || dmhandle->ld != ld || dmhandle->dense_layout != dense_layout) { CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); - auto cuda_value_type = CudaEnumType::value; - auto cuda_order = get_cuda_order(dense_layout); + auto cuda_value_type = detail::CudaEnumType::value; + auto cuda_order = detail::get_cuda_order(dense_layout); CUSPARSE_ERR_FUNC(cusparseCreateDnMat, &dmhandle->backend_handle, num_rows, num_cols, ld, val, cuda_value_type, cuda_order); dmhandle->num_rows = num_rows; @@ -222,7 +223,7 @@ sycl::event release_dense_matrix(sycl::queue& queue, dense_matrix_handle_t dmhan CUSPARSE_ERR_FUNC(cusparseDestroyDnMat, dmhandle->backend_handle); delete dmhandle; }; - return dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dmhandle); + return detail::dispatch_submit_impl_fp(__func__, queue, dependencies, functor, dmhandle); } // COO matrix @@ -235,16 +236,17 @@ void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64 auto row_acc = row_ind.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, - get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc), - cuda_index_type, cuda_index_base, cuda_value_type); + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_base, + cuda_value_type); *p_smhandle = new matrix_handle(cu_smhandle, row_ind, col_ind, val, detail::sparse_format::COO, num_rows, num_cols, nnz, index); @@ -258,12 +260,12 @@ void init_coo_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64 std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType* row_ind, intType* col_ind, fpType* val) { auto event = queue.submit([&](sycl::handler& cgh) { - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &cu_smhandle, num_rows, num_cols, nnz, row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); @@ -285,16 +287,16 @@ void set_coo_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int6 auto row_acc = row_ind.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, - nnz, get_mem(ih, row_acc), get_mem(ih, col_acc), - get_mem(ih, val_acc), cuda_index_type, cuda_index_base, + nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_base, cuda_value_type); smhandle->num_rows = num_rows; smhandle->num_cols = num_cols; @@ -303,7 +305,8 @@ void set_coo_matrix_data(sycl::queue& 
queue, matrix_handle_t smhandle, std::int6 } else { CUSPARSE_ERR_FUNC(cusparseCooSetPointers, smhandle->backend_handle, - get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc)); + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc)); } smhandle->row_container.set_buffer(row_ind); smhandle->col_container.set_buffer(col_ind); @@ -321,9 +324,9 @@ void set_coo_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t nu if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCoo, &smhandle->backend_handle, num_rows, num_cols, nnz, row_ind, col_ind, val, cuda_index_type, cuda_index_base, cuda_value_type); smhandle->num_rows = num_rows; @@ -351,16 +354,17 @@ void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64 auto row_acc = row_ptr.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, - get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc), - cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_type, + cuda_index_base, cuda_value_type); *p_smhandle = new matrix_handle(cu_smhandle, row_ptr, col_ind, val, detail::sparse_format::CSR, num_rows, num_cols, nnz, index); @@ -374,12 +378,12 @@ void init_csr_matrix(sycl::queue& queue, matrix_handle_t* p_smhandle, std::int64 std::int64_t num_cols, std::int64_t nnz, oneapi::mkl::index_base index, intType* row_ptr, intType* col_ind, fpType* val) { auto event = queue.submit([&](sycl::handler& cgh) { - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { // Ensure that a cusparse handle is created before any other cuSPARSE function is called. 
- CusparseScopedContextHandler(queue, ih).get_handle(queue); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + detail::CusparseScopedContextHandler(queue, ih).get_handle(queue); + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; cusparseSpMatDescr_t cu_smhandle; CUSPARSE_ERR_FUNC(cusparseCreateCsr, &cu_smhandle, num_rows, num_cols, nnz, row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, @@ -402,16 +406,16 @@ void set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int6 auto row_acc = row_ptr.template get_access(cgh); auto col_acc = col_ind.template get_access(cgh); auto val_acc = val.template get_access(cgh); - submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { + detail::submit_host_task(cgh, queue, [=](sycl::interop_handle ih) { if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, - nnz, get_mem(ih, row_acc), get_mem(ih, col_acc), - get_mem(ih, val_acc), cuda_index_type, cuda_index_type, + nnz, detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc), cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); smhandle->num_rows = num_rows; smhandle->num_cols = num_cols; @@ -420,7 +424,8 @@ void 
set_csr_matrix_data(sycl::queue& queue, matrix_handle_t smhandle, std::int6 } else { CUSPARSE_ERR_FUNC(cusparseCsrSetPointers, smhandle->backend_handle, - get_mem(ih, row_acc), get_mem(ih, col_acc), get_mem(ih, val_acc)); + detail::get_mem(ih, row_acc), detail::get_mem(ih, col_acc), + detail::get_mem(ih, val_acc)); } smhandle->row_container.set_buffer(row_ptr); smhandle->col_container.set_buffer(col_ind); @@ -438,9 +443,9 @@ void set_csr_matrix_data(sycl::queue&, matrix_handle_t smhandle, std::int64_t nu if (smhandle->num_rows != num_rows || smhandle->num_cols != num_cols || smhandle->nnz != nnz || smhandle->index != index) { CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); - auto cuda_index_type = CudaIndexEnumType::value; - auto cuda_index_base = get_cuda_index_base(index); - auto cuda_value_type = CudaEnumType::value; + auto cuda_index_type = detail::CudaIndexEnumType::value; + auto cuda_index_base = detail::get_cuda_index_base(index); + auto cuda_value_type = detail::CudaEnumType::value; CUSPARSE_ERR_FUNC(cusparseCreateCsr, &smhandle->backend_handle, num_rows, num_cols, nnz, row_ptr, col_ind, val, cuda_index_type, cuda_index_type, cuda_index_base, cuda_value_type); @@ -466,7 +471,7 @@ sycl::event release_sparse_matrix(sycl::queue& queue, matrix_handle_t smhandle, CUSPARSE_ERR_FUNC(cusparseDestroySpMat, smhandle->backend_handle); delete smhandle; }; - return dispatch_submit(__func__, queue, dependencies, functor, smhandle); + return detail::dispatch_submit(__func__, queue, dependencies, functor, smhandle); } // Matrix property diff --git a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp index 2653d84c1..5e5bdc732 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_handles.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_handles.hpp @@ -75,9 +75,11 @@ struct matrix_handle : public detail::generic_sparse_handleformat == detail::sparse_format::COO && + if (sm_handle->format == 
sparse_format::COO && !(sm_handle->has_matrix_property(matrix_property::sorted_by_rows) || sm_handle->has_matrix_property(matrix_property::sorted))) { throw mkl::unimplemented( @@ -86,6 +88,8 @@ inline void check_valid_matrix_properties(const std::string& function_name, } } +} // namespace detail + } // namespace oneapi::mkl::sparse #endif // _ONEMKL_SRC_SPARSE_BLAS_BACKENDS_CUSPARSE_HANDLES_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_helper.hpp b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp index b392071f5..3feb4bcad 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_helper.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_helper.hpp @@ -31,7 +31,9 @@ #include "sparse_blas/sycl_helper.hpp" #include "cusparse_error.hpp" -namespace oneapi::mkl::sparse::cusparse { +namespace oneapi::mkl::sparse::cusparse::detail { + +using namespace oneapi::mkl::sparse::detail; template struct CudaEnumType; @@ -68,12 +70,12 @@ inline std::string cast_enum_to_str(E e) { return std::to_string(static_cast(e)); } -inline cudaDataType_t get_cuda_value_type(detail::data_type onemkl_data_type) { +inline cudaDataType_t get_cuda_value_type(data_type onemkl_data_type) { switch (onemkl_data_type) { - case detail::data_type::real_fp32: return CUDA_R_32F; - case detail::data_type::real_fp64: return CUDA_R_64F; - case detail::data_type::complex_fp32: return CUDA_C_32F; - case detail::data_type::complex_fp64: return CUDA_C_64F; + case data_type::real_fp32: return CUDA_R_32F; + case data_type::real_fp64: return CUDA_R_64F; + case data_type::complex_fp32: return CUDA_C_32F; + case data_type::complex_fp64: return CUDA_C_64F; default: throw oneapi::mkl::invalid_argument( "sparse_blas", "get_cuda_value_type", @@ -103,13 +105,12 @@ inline cusparseIndexBase_t get_cuda_index_base(index_base index) { /// Return the CUDA transpose operation from a oneMKL type. /// Do not conjugate for real types to avoid an invalid argument. 
-inline cusparseOperation_t get_cuda_operation(detail::data_type type, transpose op) { +inline cusparseOperation_t get_cuda_operation(data_type type, transpose op) { switch (op) { case transpose::nontrans: return CUSPARSE_OPERATION_NON_TRANSPOSE; case transpose::trans: return CUSPARSE_OPERATION_TRANSPOSE; case transpose::conjtrans: - return (type == detail::data_type::complex_fp32 || - type == detail::data_type::complex_fp64) + return (type == data_type::complex_fp32 || type == data_type::complex_fp64) ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; default: @@ -160,6 +161,6 @@ inline void set_pointer_mode(cusparseHandle_t cu_handle, bool is_ptr_host_access : CUSPARSE_POINTER_MODE_DEVICE); } -} // namespace oneapi::mkl::sparse::cusparse +} // namespace oneapi::mkl::sparse::cusparse::detail #endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_HELPER_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp index 8b48d16dd..4d92daf35 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp +++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.cpp @@ -23,7 +23,7 @@ #include "cusparse_scope_handle.hpp" -namespace oneapi::mkl::sparse::cusparse { +namespace oneapi::mkl::sparse::cusparse::detail { /** * Inserts a new element in the map if its key is unique. 
This new element @@ -144,4 +144,4 @@ sycl::context CusparseScopedContextHandler::get_context(const sycl::queue& queue return queue.get_context(); } -} // namespace oneapi::mkl::sparse::cusparse +} // namespace oneapi::mkl::sparse::cusparse::detail diff --git a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp index 4b1ecd3e4..7b8313ee6 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_scope_handle.hpp @@ -40,7 +40,7 @@ #include "cusparse_global_handle.hpp" #include "cusparse_helper.hpp" -namespace oneapi::mkl::sparse::cusparse { +namespace oneapi::mkl::sparse::cusparse::detail { class CusparseScopedContextHandler { CUcontext original_; @@ -83,6 +83,6 @@ inline void* get_mem(sycl::interop_handle ih, AccT acc) { return reinterpret_cast(cudaPtr); } -} // namespace oneapi::mkl::sparse::cusparse +} // namespace oneapi::mkl::sparse::cusparse::detail #endif //_ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_SCOPE_HANDLE_HPP_ diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp index a78c4ed74..0d170d47b 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_task.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp @@ -25,7 +25,7 @@ /// This file provide a helper function to submit host_task using buffers or USM seamlessly -namespace oneapi::mkl::sparse::cusparse { +namespace oneapi::mkl::sparse::cusparse::detail { template auto get_value_accessor(sycl::handler& cgh, Container container) { @@ -198,8 +198,8 @@ sycl::event dispatch_submit_impl_fp_int(const std::string& function_name, sycl:: Ts... 
other_containers) { bool is_in_order_queue = queue.is_in_order(); if (sm_handle->all_use_buffer()) { - detail::data_type value_type = sm_handle->get_value_type(); - detail::data_type int_type = sm_handle->get_int_type(); + data_type value_type = sm_handle->get_value_type(); + data_type int_type = sm_handle->get_int_type(); #define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, INT_TYPE) \ return queue.submit([&](sycl::handler& cgh) { \ @@ -239,23 +239,23 @@ sycl::event dispatch_submit_impl_fp_int(const std::string& function_name, sycl:: } \ }) #define ONEMKL_CUSPARSE_SUBMIT_INT(FP_TYPE) \ - if (int_type == detail::data_type::int32) { \ + if (int_type == data_type::int32) { \ ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int32_t); \ } \ - else if (int_type == detail::data_type::int64) { \ + else if (int_type == data_type::int64) { \ ONEMKL_CUSPARSE_SUBMIT(FP_TYPE, std::int64_t); \ } - if (value_type == detail::data_type::real_fp32) { + if (value_type == data_type::real_fp32) { ONEMKL_CUSPARSE_SUBMIT_INT(float) } - else if (value_type == detail::data_type::real_fp64) { + else if (value_type == data_type::real_fp64) { ONEMKL_CUSPARSE_SUBMIT_INT(double) } - else if (value_type == detail::data_type::complex_fp32) { + else if (value_type == data_type::complex_fp32) { ONEMKL_CUSPARSE_SUBMIT_INT(std::complex) } - else if (value_type == detail::data_type::complex_fp64) { + else if (value_type == data_type::complex_fp64) { ONEMKL_CUSPARSE_SUBMIT_INT(std::complex) } @@ -296,7 +296,7 @@ sycl::event dispatch_submit_impl_fp(const std::string& function_name, sycl::queu const std::vector& dependencies, Functor functor, ContainerT container_handle) { if (container_handle->all_use_buffer()) { - detail::data_type value_type = container_handle->get_value_type(); + data_type value_type = container_handle->get_value_type(); #define ONEMKL_CUSPARSE_SUBMIT(FP_TYPE) \ return queue.submit([&](sycl::handler& cgh) { \ @@ -305,16 +305,16 @@ sycl::event dispatch_submit_impl_fp(const std::string& function_name, 
sycl::queu submit_host_task(cgh, queue, functor, fp_accs); \ }) - if (value_type == detail::data_type::real_fp32) { + if (value_type == data_type::real_fp32) { ONEMKL_CUSPARSE_SUBMIT(float); } - else if (value_type == detail::data_type::real_fp64) { + else if (value_type == data_type::real_fp64) { ONEMKL_CUSPARSE_SUBMIT(double); } - else if (value_type == detail::data_type::complex_fp32) { + else if (value_type == data_type::complex_fp32) { ONEMKL_CUSPARSE_SUBMIT(std::complex); } - else if (value_type == detail::data_type::complex_fp64) { + else if (value_type == data_type::complex_fp64) { ONEMKL_CUSPARSE_SUBMIT(std::complex); } @@ -428,6 +428,6 @@ inline void synchronize_if_needed(bool is_in_order_queue, CUstream cu_stream) { #endif } -} // namespace oneapi::mkl::sparse::cusparse +} // namespace oneapi::mkl::sparse::cusparse::detail #endif // _ONEMKL_SPARSE_BLAS_BACKENDS_CUSPARSE_TASKS_HPP_ diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp index b4d8c6b77..5fd24d3f4 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmm.cpp @@ -55,46 +55,7 @@ struct spmm_descr { namespace oneapi::mkl::sparse::cusparse { -void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { - *p_spmm_descr = new spmm_descr(); -} - -sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, - const std::vector& dependencies) { - if (!spmm_descr) { - return detail::collapse_dependencies(queue, dependencies); - } - - auto release_functor = [=]() { - spmm_descr->cu_handle = nullptr; - spmm_descr->last_optimized_A_handle = nullptr; - spmm_descr->last_optimized_B_handle = nullptr; - spmm_descr->last_optimized_C_handle = nullptr; - delete spmm_descr; - }; - - // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used - // dispatch_submit can only be used if 
the descriptor's handles are valid - if (spmm_descr->last_optimized_A_handle && - spmm_descr->last_optimized_A_handle->all_use_buffer() && - spmm_descr->last_optimized_B_handle && spmm_descr->last_optimized_C_handle && - spmm_descr->workspace.use_buffer()) { - auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { - release_functor(); - }; - return dispatch_submit( - __func__, queue, dispatch_functor, spmm_descr->last_optimized_A_handle, - spmm_descr->workspace.get_buffer(), spmm_descr->last_optimized_B_handle, - spmm_descr->last_optimized_C_handle); - } - - // Release used if USM is used or if the descriptor has been released before spmm_optimize has succeeded - sycl::event event = queue.submit([&](sycl::handler& cgh) { - cgh.depends_on(dependencies); - cgh.host_task(release_functor); - }); - return event; -} +namespace detail { inline auto get_cuda_spmm_alg(spmm_alg alg) { switch (alg) { @@ -113,8 +74,8 @@ void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose o oneapi::mkl::transpose opB, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, dense_matrix_handle_t C_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible, spmm_alg alg) { - detail::check_valid_spmm_common(function_name, A_view, A_handle, B_handle, C_handle, - is_alpha_host_accessible, is_beta_host_accessible); + check_valid_spmm_common(function_name, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible); check_valid_matrix_properties(function_name, A_handle); if (alg == spmm_alg::csr_alg3 && opA != oneapi::mkl::transpose::nontrans) { throw mkl::unimplemented( @@ -127,7 +88,7 @@ void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose o "The backend does not support spmm with the algorithm `spmm_alg::csr_alg3` if `opB` is `transpose::conjtrans`."); } if (alg == spmm_alg::csr_alg3 && opB == oneapi::mkl::transpose::trans && - A_handle->get_value_type() == 
detail::data_type::real_fp64) { + A_handle->get_value_type() == data_type::real_fp64) { // TODO: Remove once the issue is fixed: https://forums.developer.nvidia.com/t/cusparse-spmm-sample-failing-with-misaligned-address/311022 throw mkl::unimplemented( "sparse_blas", function_name, @@ -135,39 +96,6 @@ void check_valid_spmm(const std::string& function_name, oneapi::mkl::transpose o } } -void spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, - const void* alpha, matrix_view A_view, matrix_handle_t A_handle, - dense_matrix_handle_t B_handle, const void* beta, - dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, - std::size_t& temp_buffer_size) { - bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); - check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, - is_alpha_host_accessible, is_beta_host_accessible, alg); - auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { - CusparseScopedContextHandler sc(queue, ih); - auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); - spmm_descr->cu_handle = cu_handle; - spmm_descr->cu_stream = cu_stream; - auto cu_a = A_handle->backend_handle; - auto cu_b = B_handle->backend_handle; - auto cu_c = C_handle->backend_handle; - auto type = A_handle->value_container.data_type; - auto cu_op_a = get_cuda_operation(type, opA); - auto cu_op_b = get_cuda_operation(type, opB); - auto cu_type = get_cuda_value_type(type); - auto cu_alg = get_cuda_spmm_alg(alg); - set_pointer_mode(cu_handle, is_alpha_host_accessible); - auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, - cu_c, cu_type, cu_alg, &temp_buffer_size); - check_status(status, __func__); - }; - auto event = dispatch_submit(__func__, queue, functor, A_handle, B_handle, C_handle); - event.wait_and_throw(); - spmm_descr->temp_buffer_size = 
temp_buffer_size; - spmm_descr->buffer_size_called = true; -} - inline void common_spmm_optimize(oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, bool is_alpha_host_accessible, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, @@ -208,6 +136,82 @@ void spmm_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, check_status(status, "spmm_optimize"); } +} // namespace detail + +void init_spmm_descr(sycl::queue& /*queue*/, spmm_descr_t* p_spmm_descr) { + *p_spmm_descr = new spmm_descr(); +} + +sycl::event release_spmm_descr(sycl::queue& queue, spmm_descr_t spmm_descr, + const std::vector& dependencies) { + if (!spmm_descr) { + return detail::collapse_dependencies(queue, dependencies); + } + + auto release_functor = [=]() { + spmm_descr->cu_handle = nullptr; + spmm_descr->last_optimized_A_handle = nullptr; + spmm_descr->last_optimized_B_handle = nullptr; + spmm_descr->last_optimized_C_handle = nullptr; + delete spmm_descr; + }; + + // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used + // dispatch_submit can only be used if the descriptor's handles are valid + if (spmm_descr->last_optimized_A_handle && + spmm_descr->last_optimized_A_handle->all_use_buffer() && + spmm_descr->last_optimized_B_handle && spmm_descr->last_optimized_C_handle && + spmm_descr->workspace.use_buffer()) { + auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { + release_functor(); + }; + return detail::dispatch_submit( + __func__, queue, dispatch_functor, spmm_descr->last_optimized_A_handle, + spmm_descr->workspace.get_buffer(), spmm_descr->last_optimized_B_handle, + spmm_descr->last_optimized_C_handle); + } + + // Release used if USM is used or if the descriptor has been released before spmm_optimize has succeeded + sycl::event event = queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; +} + +void 
spmm_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, + const void* alpha, matrix_view A_view, matrix_handle_t A_handle, + dense_matrix_handle_t B_handle, const void* beta, + dense_matrix_handle_t C_handle, spmm_alg alg, spmm_descr_t spmm_descr, + std::size_t& temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + detail::CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spmm_descr->cu_handle = cu_handle; + spmm_descr->cu_stream = cu_stream; + auto cu_a = A_handle->backend_handle; + auto cu_b = B_handle->backend_handle; + auto cu_c = C_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op_a = detail::get_cuda_operation(type, opA); + auto cu_op_b = detail::get_cuda_operation(type, opB); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spmm_alg(alg); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMM_bufferSize(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, + cu_c, cu_type, cu_alg, &temp_buffer_size); + detail::check_status(status, __func__); + }; + auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, B_handle, C_handle); + event.wait_and_throw(); + spmm_descr->temp_buffer_size = temp_buffer_size; + spmm_descr->buffer_size_called = true; +} + void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_matrix_handle_t B_handle, const void* beta, dense_matrix_handle_t C_handle, @@ -217,8 +221,8 
@@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: if (!A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } - common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, - is_beta_host_accessible, C_handle, alg, spmm_descr); + detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, + is_beta_host_accessible, C_handle, alg, spmm_descr); // Copy the buffer to extend its lifetime until the descriptor is free'd. spmm_descr->workspace.set_buffer_untyped(workspace); if (alg == spmm_alg::no_optimize_alg || workspace.size() == 0) { @@ -227,12 +231,12 @@ void spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl:: } auto functor = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { auto cu_handle = spmm_descr->cu_handle; - auto workspace_ptr = get_mem(ih, workspace_acc); - spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg, - workspace_ptr, is_alpha_host_accessible); + auto workspace_ptr = detail::get_mem(ih, workspace_acc); + detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, + alg, workspace_ptr, is_alpha_host_accessible); }; - dispatch_submit(__func__, queue, functor, A_handle, workspace, B_handle, C_handle); + detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, B_handle, C_handle); } sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, @@ -246,8 +250,8 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, if (A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } - common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, - is_beta_host_accessible, C_handle, alg, spmm_descr); + detail::common_spmm_optimize(opA, opB, is_alpha_host_accessible, A_view, A_handle, B_handle, + is_beta_host_accessible, C_handle, alg, spmm_descr); 
spmm_descr->workspace.usm_ptr = workspace; if (alg == spmm_alg::no_optimize_alg || workspace == nullptr) { // cusparseSpMM_preprocess cannot be called if the workspace is empty @@ -255,11 +259,12 @@ sycl::event spmm_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, } auto functor = [=](sycl::interop_handle) { auto cu_handle = spmm_descr->cu_handle; - spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, alg, - workspace, is_alpha_host_accessible); + detail::spmm_optimize_impl(cu_handle, opA, opB, alpha, A_handle, B_handle, beta, C_handle, + alg, workspace, is_alpha_host_accessible); }; - return dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle, C_handle); + return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, B_handle, + C_handle); } sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::transpose opB, @@ -269,8 +274,8 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); - check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, - is_alpha_host_accessible, is_beta_host_accessible, alg); + detail::check_valid_spmm(__func__, opA, opB, A_view, A_handle, B_handle, C_handle, + is_alpha_host_accessible, is_beta_host_accessible, alg); if (A_handle->all_use_buffer() != spmm_descr->workspace.use_buffer()) { detail::throw_incompatible_container(__func__); } @@ -294,26 +299,26 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr auto cu_b = B_handle->backend_handle; auto cu_c = C_handle->backend_handle; auto type = A_handle->value_container.data_type; - auto cu_op_a = get_cuda_operation(type, opA); - auto cu_op_b = get_cuda_operation(type, opB); - auto cu_type = get_cuda_value_type(type); - auto 
cu_alg = get_cuda_spmm_alg(alg); - set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto cu_op_a = detail::get_cuda_operation(type, opA); + auto cu_op_b = detail::get_cuda_operation(type, opB); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spmm_alg(alg); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); auto status = cusparseSpMM(cu_handle, cu_op_a, cu_op_b, alpha, cu_a, cu_b, beta, cu_c, cu_type, cu_alg, workspace_ptr); - check_status(status, __func__); - synchronize_if_needed(is_in_order_queue, spmm_descr->cu_stream); + detail::check_status(status, __func__); + detail::synchronize_if_needed(is_in_order_queue, spmm_descr->cu_stream); }; if (A_handle->all_use_buffer() && spmm_descr->temp_buffer_size > 0) { // The accessor can only be created if the buffer size is greater than 0 auto functor_buffer = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { - auto workspace_ptr = get_mem(ih, workspace_acc); + auto workspace_ptr = detail::get_mem(ih, workspace_acc); compute_functor(workspace_ptr); }; - return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, - spmm_descr->workspace.get_buffer(), - B_handle, C_handle); + return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, + spmm_descr->workspace.get_buffer(), + B_handle, C_handle); } else { // The same dispatch_submit can be used for USM or buffers if no @@ -323,8 +328,8 @@ sycl::event spmm(sycl::queue& queue, oneapi::mkl::transpose opA, oneapi::mkl::tr auto functor_usm = [=](sycl::interop_handle) { compute_functor(workspace_ptr); }; - return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle, - B_handle, C_handle); + return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, + A_handle, B_handle, C_handle); } } diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp 
b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp index d1102b93a..03b848916 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spmv.cpp @@ -54,46 +54,7 @@ struct spmv_descr { namespace oneapi::mkl::sparse::cusparse { -void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) { - *p_spmv_descr = new spmv_descr(); -} - -sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, - const std::vector& dependencies) { - if (!spmv_descr) { - return detail::collapse_dependencies(queue, dependencies); - } - - auto release_functor = [=]() { - spmv_descr->cu_handle = nullptr; - spmv_descr->last_optimized_A_handle = nullptr; - spmv_descr->last_optimized_x_handle = nullptr; - spmv_descr->last_optimized_y_handle = nullptr; - delete spmv_descr; - }; - - // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used - // dispatch_submit can only be used if the descriptor's handles are valid - if (spmv_descr->last_optimized_A_handle && - spmv_descr->last_optimized_A_handle->all_use_buffer() && - spmv_descr->last_optimized_x_handle && spmv_descr->last_optimized_y_handle && - spmv_descr->workspace.use_buffer()) { - auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { - release_functor(); - }; - return dispatch_submit( - __func__, queue, dispatch_functor, spmv_descr->last_optimized_A_handle, - spmv_descr->workspace.get_buffer(), spmv_descr->last_optimized_x_handle, - spmv_descr->last_optimized_y_handle); - } - - // Release used if USM is used or if the descriptor has been released before spmv_optimize has succeeded - sycl::event event = queue.submit([&](sycl::handler& cgh) { - cgh.depends_on(dependencies); - cgh.host_task(release_functor); - }); - return event; -} +namespace detail { inline auto get_cuda_spmv_alg(spmv_alg alg) { switch (alg) { @@ -109,8 +70,8 @@ void check_valid_spmv(const std::string& 
function_name, oneapi::mkl::transpose o matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, bool is_alpha_host_accessible, bool is_beta_host_accessible) { - detail::check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle, - is_alpha_host_accessible, is_beta_host_accessible); + check_valid_spmv_common(function_name, opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); check_valid_matrix_properties(function_name, A_handle); if (A_view.type_view != matrix_descr::general) { throw mkl::unimplemented( @@ -119,38 +80,6 @@ void check_valid_spmv(const std::string& function_name, oneapi::mkl::transpose o } } -void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, - matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, - const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, - spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { - bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); - check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, - is_beta_host_accessible); - - auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { - CusparseScopedContextHandler sc(queue, ih); - auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); - spmv_descr->cu_handle = cu_handle; - spmv_descr->cu_stream = cu_stream; - auto cu_a = A_handle->backend_handle; - auto cu_x = x_handle->backend_handle; - auto cu_y = y_handle->backend_handle; - auto type = A_handle->value_container.data_type; - auto cu_op = get_cuda_operation(type, opA); - auto cu_type = get_cuda_value_type(type); - auto cu_alg = get_cuda_spmv_alg(alg); - set_pointer_mode(cu_handle, is_alpha_host_accessible); - auto status = cusparseSpMV_bufferSize(cu_handle, cu_op, alpha, 
cu_a, cu_x, beta, cu_y, - cu_type, cu_alg, &temp_buffer_size); - check_status(status, __func__); - }; - auto event = dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); - event.wait_and_throw(); - spmv_descr->temp_buffer_size = temp_buffer_size; - spmv_descr->buffer_size_called = true; -} - inline void common_spmv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, bool is_beta_host_accessible, @@ -191,6 +120,81 @@ void spmv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, } #endif +} // namespace detail + +void init_spmv_descr(sycl::queue& /*queue*/, spmv_descr_t* p_spmv_descr) { + *p_spmv_descr = new spmv_descr(); +} + +sycl::event release_spmv_descr(sycl::queue& queue, spmv_descr_t spmv_descr, + const std::vector& dependencies) { + if (!spmv_descr) { + return detail::collapse_dependencies(queue, dependencies); + } + + auto release_functor = [=]() { + spmv_descr->cu_handle = nullptr; + spmv_descr->last_optimized_A_handle = nullptr; + spmv_descr->last_optimized_x_handle = nullptr; + spmv_descr->last_optimized_y_handle = nullptr; + delete spmv_descr; + }; + + // Use dispatch_submit to ensure the descriptor is kept alive as long as the buffers are used + // dispatch_submit can only be used if the descriptor's handles are valid + if (spmv_descr->last_optimized_A_handle && + spmv_descr->last_optimized_A_handle->all_use_buffer() && + spmv_descr->last_optimized_x_handle && spmv_descr->last_optimized_y_handle && + spmv_descr->workspace.use_buffer()) { + auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { + release_functor(); + }; + return detail::dispatch_submit( + __func__, queue, dispatch_functor, spmv_descr->last_optimized_A_handle, + spmv_descr->workspace.get_buffer(), spmv_descr->last_optimized_x_handle, + spmv_descr->last_optimized_y_handle); + } + + // Release used if USM is used or if the descriptor has been 
released before spmv_optimize has succeeded + sycl::event event = queue.submit([&](sycl::handler& cgh) { + cgh.depends_on(dependencies); + cgh.host_task(release_functor); + }); + return event; +} + +void spmv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, + const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, + spmv_descr_t spmv_descr, std::size_t& temp_buffer_size) { + bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); + bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); + detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); + + auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { + detail::CusparseScopedContextHandler sc(queue, ih); + auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); + spmv_descr->cu_handle = cu_handle; + spmv_descr->cu_stream = cu_stream; + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spmv_alg(alg); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpMV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, + cu_type, cu_alg, &temp_buffer_size); + detail::check_status(status, __func__); + }; + auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + event.wait_and_throw(); + spmv_descr->temp_buffer_size = temp_buffer_size; + spmv_descr->buffer_size_called = true; +} + void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t 
x_handle, const void* beta, dense_vector_handle_t y_handle, spmv_alg alg, @@ -200,8 +204,8 @@ void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a if (!A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } - common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, - is_beta_host_accessible, y_handle, alg, spmv_descr); + detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + is_beta_host_accessible, y_handle, alg, spmv_descr); // Copy the buffer to extend its lifetime until the descriptor is free'd. spmv_descr->workspace.set_buffer_untyped(workspace); if (alg == spmv_alg::no_optimize_alg) { @@ -215,21 +219,21 @@ void spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a if (spmv_descr->temp_buffer_size > 0) { auto functor = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { auto cu_handle = spmv_descr->cu_handle; - auto workspace_ptr = get_mem(ih, workspace_acc); - spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, - workspace_ptr, is_alpha_host_accessible); + auto workspace_ptr = detail::get_mem(ih, workspace_acc); + detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, + alg, workspace_ptr, is_alpha_host_accessible); }; // The accessor can only be created if the buffer size is greater than 0 - dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); + detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { auto functor = [=](sycl::interop_handle) { auto cu_handle = spmv_descr->cu_handle; - spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, - nullptr, is_alpha_host_accessible); + detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, + alg, nullptr, is_alpha_host_accessible); }; - dispatch_submit(__func__, queue, functor, 
A_handle, x_handle, y_handle); + detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); } #endif } @@ -244,8 +248,8 @@ sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const if (A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } - common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, - is_beta_host_accessible, y_handle, alg, spmv_descr); + detail::common_spmv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + is_beta_host_accessible, y_handle, alg, spmv_descr); spmv_descr->workspace.usm_ptr = workspace; if (alg == spmv_alg::no_optimize_alg) { return detail::collapse_dependencies(queue, dependencies); @@ -257,10 +261,11 @@ sycl::event spmv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const #else auto functor = [=](sycl::interop_handle) { auto cu_handle = spmv_descr->cu_handle; - spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, - workspace, is_alpha_host_accessible); + detail::spmv_optimize_impl(cu_handle, opA, alpha, A_handle, x_handle, beta, y_handle, alg, + workspace, is_alpha_host_accessible); }; - return dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle); + return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, + y_handle); #endif } @@ -270,8 +275,8 @@ sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alp spmv_descr_t spmv_descr, const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); bool is_beta_host_accessible = detail::is_ptr_accessible_on_host(queue, beta); - check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible, - is_beta_host_accessible); + detail::check_valid_spmv(__func__, opA, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible, is_beta_host_accessible); if 
(A_handle->all_use_buffer() != spmv_descr->workspace.use_buffer()) { detail::throw_incompatible_container(__func__); } @@ -294,25 +299,25 @@ sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alp auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; auto type = A_handle->value_container.data_type; - auto cu_op = get_cuda_operation(type, opA); - auto cu_type = get_cuda_value_type(type); - auto cu_alg = get_cuda_spmv_alg(alg); - set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spmv_alg(alg); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); auto status = cusparseSpMV(cu_handle, cu_op, alpha, cu_a, cu_x, beta, cu_y, cu_type, cu_alg, workspace_ptr); - check_status(status, __func__); - synchronize_if_needed(is_in_order_queue, spmv_descr->cu_stream); + detail::check_status(status, __func__); + detail::synchronize_if_needed(is_in_order_queue, spmv_descr->cu_stream); }; if (A_handle->all_use_buffer() && spmv_descr->temp_buffer_size > 0) { // The accessor can only be created if the buffer size is greater than 0 auto functor_buffer = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { - auto workspace_ptr = get_mem(ih, workspace_acc); + auto workspace_ptr = detail::get_mem(ih, workspace_acc); compute_functor(workspace_ptr); }; - return dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, - spmv_descr->workspace.get_buffer(), - x_handle, y_handle); + return detail::dispatch_submit_native_ext(__func__, queue, functor_buffer, A_handle, + spmv_descr->workspace.get_buffer(), + x_handle, y_handle); } else { // The same dispatch_submit can be used for USM or buffers if no @@ -322,8 +327,8 @@ sycl::event spmv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alp auto functor_usm = [=](sycl::interop_handle) { compute_functor(workspace_ptr); }; 
- return dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, A_handle, - x_handle, y_handle); + return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor_usm, + A_handle, x_handle, y_handle); } } diff --git a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp index 4f2b60502..5c49df013 100644 --- a/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp +++ b/src/sparse_blas/backends/cusparse/operations/cusparse_spsv.cpp @@ -54,6 +54,61 @@ struct spsv_descr { namespace oneapi::mkl::sparse::cusparse { +namespace detail { + +inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { + return CUSPARSE_SPSV_ALG_DEFAULT; +} + +void check_valid_spsv(const std::string& function_name, matrix_view A_view, + matrix_handle_t A_handle, dense_vector_handle_t x_handle, + dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { + check_valid_spsv_common(function_name, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + check_valid_matrix_properties(function_name, A_handle); +} + +inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr) { + check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); + if (!spsv_descr->buffer_size_called) { + throw mkl::uninitialized("sparse_blas", "spsv_optimize", + "spsv_buffer_size must be called before spsv_optimize."); + } + spsv_descr->optimized_called = true; + spsv_descr->last_optimized_opA = opA; + spsv_descr->last_optimized_A_view = A_view; + spsv_descr->last_optimized_A_handle = A_handle; + spsv_descr->last_optimized_x_handle = x_handle; + spsv_descr->last_optimized_y_handle = y_handle; + spsv_descr->last_optimized_alg = alg; +} + +void spsv_optimize_impl(cusparseHandle_t 
cu_handle, oneapi::mkl::transpose opA, const void* alpha, + matrix_view A_view, matrix_handle_t A_handle, + dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, + spsv_alg alg, spsv_descr_t spsv_descr, void* workspace_ptr, + bool is_alpha_host_accessible) { + auto cu_a = A_handle->backend_handle; + auto cu_x = x_handle->backend_handle; + auto cu_y = y_handle->backend_handle; + auto type = A_handle->value_container.data_type; + set_matrix_attributes("spsv_optimize", cu_a, A_view); + auto cu_op = get_cuda_operation(type, opA); + auto cu_type = get_cuda_value_type(type); + auto cu_alg = get_cuda_spsv_alg(alg); + auto cu_descr = spsv_descr->cu_descr; + set_pointer_mode(cu_handle, is_alpha_host_accessible); + auto status = cusparseSpSV_analysis(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, + cu_descr, workspace_ptr); + check_status(status, "spsv_optimize"); +} + +} // namespace detail + void init_spsv_descr(sycl::queue& /*queue*/, spsv_descr_t* p_spsv_descr) { *p_spsv_descr = new spsv_descr(); CUSPARSE_ERR_FUNC(cusparseSpSV_createDescr, &(*p_spsv_descr)->cu_descr); @@ -84,7 +139,7 @@ sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, auto dispatch_functor = [=](sycl::interop_handle, sycl::accessor) { release_functor(); }; - return dispatch_submit( + return detail::dispatch_submit( __func__, queue, dispatch_functor, spsv_descr->last_optimized_A_handle, spsv_descr->workspace.get_buffer(), spsv_descr->last_optimized_x_handle, spsv_descr->last_optimized_y_handle); @@ -98,26 +153,15 @@ sycl::event release_spsv_descr(sycl::queue& queue, spsv_descr_t spsv_descr, return event; } -inline auto get_cuda_spsv_alg(spsv_alg /*alg*/) { - return CUSPARSE_SPSV_ALG_DEFAULT; -} - -void check_valid_spsv(const std::string& function_name, matrix_view A_view, - matrix_handle_t A_handle, dense_vector_handle_t x_handle, - dense_vector_handle_t y_handle, bool is_alpha_host_accessible) { - detail::check_valid_spsv_common(function_name, A_view, 
A_handle, x_handle, y_handle, - is_alpha_host_accessible); - check_valid_matrix_properties(function_name, A_handle); -} - void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, std::size_t& temp_buffer_size) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); + detail::check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, + is_alpha_host_accessible); auto functor = [=, &temp_buffer_size](sycl::interop_handle ih) { - CusparseScopedContextHandler sc(queue, ih); + detail::CusparseScopedContextHandler sc(queue, ih); auto [cu_handle, cu_stream] = sc.get_handle_and_stream(queue); spsv_descr->cu_handle = cu_handle; spsv_descr->cu_stream = cu_stream; @@ -125,60 +169,21 @@ void spsv_buffer_size(sycl::queue& queue, oneapi::mkl::transpose opA, const void auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; auto type = A_handle->value_container.data_type; - set_matrix_attributes(__func__, cu_a, A_view); - auto cu_op = get_cuda_operation(type, opA); - auto cu_type = get_cuda_value_type(type); - auto cu_alg = get_cuda_spsv_alg(alg); + detail::set_matrix_attributes(__func__, cu_a, A_view); + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spsv_alg(alg); auto cu_descr = spsv_descr->cu_descr; - set_pointer_mode(cu_handle, is_alpha_host_accessible); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); auto status = cusparseSpSV_bufferSize(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, cu_descr, &temp_buffer_size); - check_status(status, __func__); + detail::check_status(status, __func__); }; - auto event = dispatch_submit(__func__, 
queue, functor, A_handle, x_handle, y_handle); + auto event = detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); event.wait_and_throw(); spsv_descr->buffer_size_called = true; } -inline void common_spsv_optimize(oneapi::mkl::transpose opA, bool is_alpha_host_accessible, - matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, - spsv_alg alg, spsv_descr_t spsv_descr) { - check_valid_spsv("spsv_optimize", A_view, A_handle, x_handle, y_handle, - is_alpha_host_accessible); - if (!spsv_descr->buffer_size_called) { - throw mkl::uninitialized("sparse_blas", "spsv_optimize", - "spsv_buffer_size must be called before spsv_optimize."); - } - spsv_descr->optimized_called = true; - spsv_descr->last_optimized_opA = opA; - spsv_descr->last_optimized_A_view = A_view; - spsv_descr->last_optimized_A_handle = A_handle; - spsv_descr->last_optimized_x_handle = x_handle; - spsv_descr->last_optimized_y_handle = y_handle; - spsv_descr->last_optimized_alg = alg; -} - -void spsv_optimize_impl(cusparseHandle_t cu_handle, oneapi::mkl::transpose opA, const void* alpha, - matrix_view A_view, matrix_handle_t A_handle, - dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, - spsv_alg alg, spsv_descr_t spsv_descr, void* workspace_ptr, - bool is_alpha_host_accessible) { - auto cu_a = A_handle->backend_handle; - auto cu_x = x_handle->backend_handle; - auto cu_y = y_handle->backend_handle; - auto type = A_handle->value_container.data_type; - set_matrix_attributes("spsv_optimize", cu_a, A_view); - auto cu_op = get_cuda_operation(type, opA); - auto cu_type = get_cuda_value_type(type); - auto cu_alg = get_cuda_spsv_alg(alg); - auto cu_descr = spsv_descr->cu_descr; - set_pointer_mode(cu_handle, is_alpha_host_accessible); - auto status = cusparseSpSV_analysis(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, - cu_descr, workspace_ptr); - check_status(status, "spsv_optimize"); -} - void 
spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, matrix_view A_view, matrix_handle_t A_handle, dense_vector_handle_t x_handle, dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, @@ -187,8 +192,8 @@ void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a if (!A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } - common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, - spsv_descr); + detail::common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + y_handle, alg, spsv_descr); // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE // Copy the buffer to extend its lifetime until the descriptor is free'd. spsv_descr->workspace.set_buffer_untyped(workspace); @@ -196,22 +201,22 @@ void spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const void* a if (workspace.size() > 0) { auto functor = [=](sycl::interop_handle ih, sycl::accessor workspace_acc) { auto cu_handle = spsv_descr->cu_handle; - auto workspace_ptr = get_mem(ih, workspace_acc); - spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, - spsv_descr, workspace_ptr, is_alpha_host_accessible); + auto workspace_ptr = detail::get_mem(ih, workspace_acc); + detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, + alg, spsv_descr, workspace_ptr, is_alpha_host_accessible); }; // The accessor can only be created if the buffer size is greater than 0 - dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); + detail::dispatch_submit(__func__, queue, functor, A_handle, workspace, x_handle, y_handle); } else { auto functor = [=](sycl::interop_handle) { auto cu_handle = spsv_descr->cu_handle; - spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, - spsv_descr, nullptr, is_alpha_host_accessible); + 
detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, + alg, spsv_descr, nullptr, is_alpha_host_accessible); }; - dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); + detail::dispatch_submit(__func__, queue, functor, A_handle, x_handle, y_handle); } } @@ -224,16 +229,17 @@ sycl::event spsv_optimize(sycl::queue& queue, oneapi::mkl::transpose opA, const if (A_handle->all_use_buffer()) { detail::throw_incompatible_container(__func__); } - common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, y_handle, alg, - spsv_descr); + detail::common_spsv_optimize(opA, is_alpha_host_accessible, A_view, A_handle, x_handle, + y_handle, alg, spsv_descr); // Ignore spsv_alg::no_optimize_alg as this step is mandatory for cuSPARSE auto functor = [=](sycl::interop_handle) { auto cu_handle = spsv_descr->cu_handle; - spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, - spsv_descr, workspace, is_alpha_host_accessible); + detail::spsv_optimize_impl(cu_handle, opA, alpha, A_view, A_handle, x_handle, y_handle, alg, + spsv_descr, workspace, is_alpha_host_accessible); }; // No need to store the workspace USM pointer as the backend stores it already - return dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, y_handle); + return detail::dispatch_submit(__func__, queue, dependencies, functor, A_handle, x_handle, + y_handle); } sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alpha, @@ -241,7 +247,8 @@ sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alp dense_vector_handle_t y_handle, spsv_alg alg, spsv_descr_t spsv_descr, const std::vector& dependencies) { bool is_alpha_host_accessible = detail::is_ptr_accessible_on_host(queue, alpha); - check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, is_alpha_host_accessible); + detail::check_valid_spsv(__func__, A_view, A_handle, x_handle, y_handle, + 
is_alpha_host_accessible); if (A_handle->all_use_buffer() != spsv_descr->workspace.use_buffer()) { detail::throw_incompatible_container(__func__); } @@ -264,19 +271,19 @@ sycl::event spsv(sycl::queue& queue, oneapi::mkl::transpose opA, const void* alp auto cu_x = x_handle->backend_handle; auto cu_y = y_handle->backend_handle; auto type = A_handle->value_container.data_type; - set_matrix_attributes(__func__, cu_a, A_view); - auto cu_op = get_cuda_operation(type, opA); - auto cu_type = get_cuda_value_type(type); - auto cu_alg = get_cuda_spsv_alg(alg); + detail::set_matrix_attributes(__func__, cu_a, A_view); + auto cu_op = detail::get_cuda_operation(type, opA); + auto cu_type = detail::get_cuda_value_type(type); + auto cu_alg = detail::get_cuda_spsv_alg(alg); auto cu_descr = spsv_descr->cu_descr; - set_pointer_mode(cu_handle, is_alpha_host_accessible); + detail::set_pointer_mode(cu_handle, is_alpha_host_accessible); auto status = cusparseSpSV_solve(cu_handle, cu_op, alpha, cu_a, cu_x, cu_y, cu_type, cu_alg, cu_descr); - check_status(status, __func__); - synchronize_if_needed(is_in_order_queue, spsv_descr->cu_stream); + detail::check_status(status, __func__); + detail::synchronize_if_needed(is_in_order_queue, spsv_descr->cu_stream); }; - return dispatch_submit_native_ext(__func__, queue, dependencies, functor, A_handle, x_handle, - y_handle); + return detail::dispatch_submit_native_ext(__func__, queue, dependencies, functor, A_handle, + x_handle, y_handle); } } // namespace oneapi::mkl::sparse::cusparse From 288823245dc0419130bf5d8d138d9eaed565e312 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 25 Oct 2024 14:45:19 +0200 Subject: [PATCH 40/43] Fix CT example return value and expected result --- .../sparse_blas_spmv_usm_mklcpu_cusparse.cpp | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp 
b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp index 2f3be76ed..31ce1975c 100644 --- a/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp +++ b/examples/sparse_blas/compile_time_dispatching/sparse_blas_spmv_usm_mklcpu_cusparse.cpp @@ -176,22 +176,14 @@ int run_sparse_matrix_vector_multiply_example(selectorType& selector) { // Post Processing // + // The example assume matrices are not transposed and beta=0 for simplicity. + // See the tests for more in-depth verification. fpType* res = y; - fpType expected_res[size]; - const bool isConj = (transA == oneapi::mkl::transpose::conjtrans); - for (intType row = 0; row < size; row++) { - expected_res[row] *= beta; - } - for (intType row = 0; row < size; row++) { - fpType tmp = alpha * x[row]; - for (intType i = ia[row]; i < ia[row + 1]; i++) { - if constexpr (is_complex()) { - expected_res[ja[i]] += tmp * (isConj ? std::conj(a[i]) : a[i]); - } - else { - expected_res[ja[i]] += tmp * a[i]; - } - } + fpType expected_res[size] = {}; + for (intType i = 0; i < nnz; ++i) { + intType row = ia[i]; + intType col = ja[i]; + expected_res[row] += alpha * x[col] * a[i]; } bool good = true; @@ -277,8 +269,12 @@ int main(int /*argc*/, char** /*argv*/) { << std::endl; std::cout << "Running with single precision real data type:" << std::endl; - run_sparse_matrix_vector_multiply_example(cpu_selector); - run_sparse_matrix_vector_multiply_example(gpu_selector); + int err = run_sparse_matrix_vector_multiply_example(cpu_selector); + if (err) + return err; + err = run_sparse_matrix_vector_multiply_example(gpu_selector); + if (err) + return err; std::cout << "Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE." 
<< std::endl; } catch (sycl::exception const& e) { From b4f553cc0112ae9252f00f1ff25b466725b81a95 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 25 Oct 2024 15:28:34 +0200 Subject: [PATCH 41/43] Update example README output --- examples/README.md | 124 ++++++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/examples/README.md b/examples/README.md index 0dad8772d..45a100131 100644 --- a/examples/README.md +++ b/examples/README.md @@ -4,7 +4,7 @@ oneAPI Math Kernel Library (oneMKL) Interfaces offers examples with the followin - rng: uniform_usm - lapack: getrs_usm - dft: complex_fwd_usm, real_fwd_usm -- sparse_blas: sparse_gemv_usm +- sparse_blas: sparse_spmv_usm Each routine has one run-time dispatching example and one compile-time dispatching example (which uses both mklcpu and cuda backends), located in `example/<$domain>/run_time_dispatching` and `example/<$domain>/compile_time_dispatching` subfolders, respectively. @@ -487,111 +487,119 @@ Unsupported Configuration: Run-time dispatching examples with mklcpu backend ``` $ export ONEAPI_DEVICE_SELECTOR="opencl:cpu" -$ ./bin/example_sparse_blas_gemv_usm +$ ./bin/example_sparse_blas_spmv_usm ######################################################################## -# Sparse Matrix-Vector Multiply Example: -# +# Sparse Matrix-Vector Multiply Example: +# # y = alpha * op(A) * x + beta * y -# +# # where A is a sparse matrix in CSR format, x and y are dense vectors # and alpha, beta are floating point type precision scalars. -# +# # Using apis: -# sparse::gemv -# +# sparse::spmv +# # Using single precision (float) data type -# +# # Device will be selected during runtime. # The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify # available devices -# +# ######################################################################## -Running Sparse BLAS GEMV USM example on CPU device. 
-Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz +Running Sparse BLAS SPMV USM example on CPU device. +Device name is: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz Running with single precision real data type: - sparse::gemv parameters: - transA = nontrans - nrows = 64 - alpha = 1, beta = 0 + sparse::spmv parameters: + transA = nontrans + nrows = 64 + alpha = 1, beta = 0 - sparse::gemv example passed - Finished -Sparse BLAS GEMV USM example ran OK. + sparse::spmv example passed + Finished +Sparse BLAS SPMV USM example ran OK. ``` Run-time dispatching examples with mklgpu backend ``` $ export ONEAPI_DEVICE_SELECTOR="level_zero:gpu" -$ ./bin/example_sparse_blas_gemv_usm +$ ./bin/example_sparse_blas_spmv_usm ######################################################################## -# Sparse Matrix-Vector Multiply Example: -# +# Sparse Matrix-Vector Multiply Example: +# # y = alpha * op(A) * x + beta * y -# +# # where A is a sparse matrix in CSR format, x and y are dense vectors # and alpha, beta are floating point type precision scalars. -# +# # Using apis: -# sparse::gemv -# +# sparse::spmv +# # Using single precision (float) data type -# +# # Device will be selected during runtime. # The environment variable ONEAPI_DEVICE_SELECTOR can be used to specify # available devices -# +# ######################################################################## -Running Sparse BLAS GEMV USM example on GPU device. +Running Sparse BLAS SPMV USM example on GPU device. Device name is: Intel(R) HD Graphics 530 [0x1912] Running with single precision real data type: - sparse::gemv parameters: - transA = nontrans - nrows = 64 - alpha = 1, beta = 0 + sparse::spmv parameters: + transA = nontrans + nrows = 64 + alpha = 1, beta = 0 - sparse::gemv example passed - Finished -Sparse BLAS GEMV USM example ran OK. + sparse::spmv example passed + Finished +Sparse BLAS SPMV USM example ran OK. 
``` -Compile-time dispatching example with mklcpu backend +Compile-time dispatching example with both mklcpu and cusparse backend ``` -$ export ONEAPI_DEVICE_SELECTOR="opencl:cpu" -$ ./bin/example_sparse_blas_gemv_usm_mklcpu +$ ./bin/sparse_blas_spmv_usm_mklcpu_cusparse ######################################################################## -# Sparse Matrix-Vector Multiply Example: -# +# Sparse Matrix-Vector Multiply Example: +# # y = alpha * op(A) * x + beta * y -# -# where A is a sparse matrix in CSR format, x and y are dense vectors +# +# where A is a sparse matrix in COO format, x and y are dense vectors # and alpha, beta are floating point type precision scalars. -# +# # Using apis: -# sparse::gemv -# +# sparse::spmv +# # Using single precision (float) data type -# -# Running on Intel CPU device -# +# +# Running on both Intel CPU and Nvidia GPU devices +# ######################################################################## -Running Sparse BLAS GEMV USM example on CPU device. -Device name is: Intel(R) Core(TM) i7-6700K CPU @ 4.00GHz +Running Sparse BLAS SPMV USM example on: + CPU device: Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz + GPU device: NVIDIA A100-PCIE-40GB Running with single precision real data type: - sparse::gemv parameters: - transA = nontrans - nrows = 64 - alpha = 1, beta = 0 + sparse::spmv parameters: + transA = nontrans + size = 8 + alpha = 1, beta = 0 + + sparse::spmv example passed + Finished + + sparse::spmv parameters: + transA = nontrans + size = 8 + alpha = 1, beta = 0 - sparse::gemv example passed - Finished -Sparse BLAS GEMV USM example ran OK. + sparse::spmv example passed + Finished +Sparse BLAS SPMV USM example ran OK on MKLCPU and CUSPARSE. 
``` From a2177f7e1fd1e48e011193da142d21cc4f111326 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 25 Oct 2024 16:58:33 +0200 Subject: [PATCH 42/43] clang-format --- tests/unit_tests/sparse_blas/include/test_spmm.hpp | 4 ++-- tests/unit_tests/sparse_blas/include/test_spmv.hpp | 7 +++---- tests/unit_tests/sparse_blas/include/test_spsv.hpp | 6 +++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/unit_tests/sparse_blas/include/test_spmm.hpp b/tests/unit_tests/sparse_blas/include/test_spmm.hpp index b17a26d21..153862f53 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmm.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmm.hpp @@ -194,8 +194,8 @@ void test_helper_with_format_with_transpose( } // In-order queue EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, - ncols_A, ncols_C, density_A_matrix, index_zero, col_major, transpose_A, + test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, ncols_A, + ncols_C, density_A_matrix, index_zero, col_major, transpose_A, transpose_B, fp_one, fp_zero, ldb, ldc, default_alg, default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); diff --git a/tests/unit_tests/sparse_blas/include/test_spmv.hpp b/tests/unit_tests/sparse_blas/include/test_spmv.hpp index 5d5f3a010..50b5aa7db 100644 --- a/tests/unit_tests/sparse_blas/include/test_spmv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spmv.hpp @@ -206,10 +206,9 @@ void test_helper_with_format_with_transpose( } // In-order queue EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, - ncols_A, density_A_matrix, index_zero, transpose_val, fp_one, fp_zero, - default_alg, default_A_view, default_properties, no_reset_data, - no_scalars_on_device), + test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, nrows_A, ncols_A, + density_A_matrix, index_zero, 
transpose_val, fp_one, fp_zero, default_alg, + default_A_view, default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } diff --git a/tests/unit_tests/sparse_blas/include/test_spsv.hpp b/tests/unit_tests/sparse_blas/include/test_spsv.hpp index ad2413f2f..94f5eacb1 100644 --- a/tests/unit_tests/sparse_blas/include/test_spsv.hpp +++ b/tests/unit_tests/sparse_blas/include/test_spsv.hpp @@ -155,9 +155,9 @@ void test_helper_with_format(testFunctorI32 test_functor_i32, testFunctorI64 tes } // In-order queue EXPECT_TRUE_OR_FUTURE_SKIP( - test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, m, - density_A_matrix, index_zero, transpose_val, alpha, default_alg, - default_A_view, default_properties, no_reset_data, no_scalars_on_device), + test_functor_i32(dev, { sycl::property::queue::in_order{} }, format, m, density_A_matrix, + index_zero, transpose_val, alpha, default_alg, default_A_view, + default_properties, no_reset_data, no_scalars_on_device), num_passed, num_skipped); } From eef836a9dd746cc58355e61419f7622d70a631f2 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Mon, 28 Oct 2024 16:21:52 +0000 Subject: [PATCH 43/43] Rename variables that are not placeholder anymore --- .../backends/cusparse/cusparse_task.hpp | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/src/sparse_blas/backends/cusparse/cusparse_task.hpp b/src/sparse_blas/backends/cusparse/cusparse_task.hpp index 0d170d47b..0d86d642d 100644 --- a/src/sparse_blas/backends/cusparse/cusparse_task.hpp +++ b/src/sparse_blas/backends/cusparse/cusparse_task.hpp @@ -79,7 +79,7 @@ void submit_host_task(sycl::handler& cgh, sycl::queue& queue, Functor functor, template void submit_host_task_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor, - sycl::accessor workspace_placeholder_acc, + sycl::accessor workspace_acc, CaptureOnlyAcc... 
capture_only_accessors) { // Only capture the accessors to ensure the dependencies are properly // handled. The accessors's pointer have already been set to the native @@ -88,12 +88,12 @@ void submit_host_task_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor f // specification but should be true for all the implementations. This // assumption avoids the overhead of resetting the pointer of all data // handles for each enqueued command. - cgh.host_task([functor, queue, workspace_placeholder_acc, - capture_only_accessors...](sycl::interop_handle ih) { - auto unused = std::make_tuple(capture_only_accessors...); - (void)unused; - functor(ih, workspace_placeholder_acc); - }); + cgh.host_task( + [functor, queue, workspace_acc, capture_only_accessors...](sycl::interop_handle ih) { + auto unused = std::make_tuple(capture_only_accessors...); + (void)unused; + functor(ih, workspace_acc); + }); } template @@ -137,7 +137,7 @@ void submit_native_command_ext(sycl::handler& cgh, sycl::queue& queue, Functor f template void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, Functor functor, const std::vector& dependencies, - sycl::accessor workspace_placeholder_acc, + sycl::accessor workspace_acc, CaptureOnlyAcc... capture_only_accessors) { // Only capture the accessors to ensure the dependencies are properly // handled. The accessors's pointer have already been set to the native @@ -147,8 +147,7 @@ void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, // assumption avoids the overhead of resetting the pointer of all data // handles for each enqueued command. 
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND - cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies, - workspace_placeholder_acc, + cgh.ext_codeplay_enqueue_native_command([functor, queue, dependencies, workspace_acc, capture_only_accessors...](sycl::interop_handle ih) { auto unused = std::make_tuple(capture_only_accessors...); (void)unused; @@ -166,12 +165,11 @@ void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, for (auto event : dependencies) { event.wait(); } - functor(ih, workspace_placeholder_acc); + functor(ih, workspace_acc); }); #else (void)dependencies; - submit_host_task_with_acc(cgh, queue, functor, workspace_placeholder_acc, - capture_only_accessors...); + submit_host_task_with_acc(cgh, queue, functor, workspace_acc, capture_only_accessors...); #endif } @@ -179,9 +177,9 @@ void submit_native_command_ext_with_acc(sycl::handler& cgh, sycl::queue& queue, /// \p other_containers and ensure the dependencies of buffers are respected. /// The accessors are not directly used as the underlying data pointer has /// already been captured in previous functions. -/// \p workspace_placeholder_acc is a placeholder accessor that will be bound to -/// the cgh if not empty and given to the functor as a last argument. -/// \p UseWorkspace must be true to use the placeholder accessor. +/// \p workspace_buffer is an optional buffer. Its accessor will be given to the +/// functor as a last argument if \p UseWorkspace is true. +/// \p UseWorkspace must be true to use the given \p workspace_buffer. /// \p UseEnqueueNativeCommandExt controls whether host_task are used or the /// extension ext_codeplay_enqueue_native_command is used to launch tasks. The /// extension should only be used for asynchronous functions using native